當前位置：首頁 > 人文社科 > 生活经验 >内容正文

生活经验

如何在框架外部自定义C++ OP

發布時間：2023/11/28 生活经验 31 豆豆

生活随笔收集整理的這篇文章主要介紹了如何在框架外部自定义C++ OP 小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

如何在框架外部自定義C++ OP
通常，如果PaddlePaddle的Operator(OP)庫中沒有所需要的操作，建議先嘗試使用已有的OP組合，如果無法組合出您需要的操作，可以嘗試使用paddle.static.py_func，也可以按照這篇教程自定義C++ OP。當然，如果用若干OP組合出來的OP性能無法滿足要求，也可以自定義C++ OP。
自定義OP需要以下幾個步驟:

實現OP和注冊OP，和在框架內部寫OP完全相同，遵守“如何寫新的C++ OP”的規范和步驟。當然，實現Gradient OP是可選的。
編譯出動態庫。
封裝該OP的Python接口。
寫OP的單測。
下面通過一個具體的例子來詳細的介紹，一步一步教會如何實現。下面通過實現relu op來介紹。
自定義OP的實現
OP的實現與“如何寫新的C++ OP”的教程相同，簡單地說需要: 1). 定義OP的ProtoMaker，即描述OP的輸入、輸出、屬性信息；2). 實現OP的定義和InferShape，以及OP的kernel函數，反向OP類似。3). 注冊OP，以及OP的計算函數。
ReLU OP的CPU實現， relu_op.cc 文件:
// relu_op.cc
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

// Declares the forward op's interface: input X, output Y, and the op comment.
// No attributes are added.
class Relu2OpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  // Registers the input "X", the output "Y" and the documentation string.
  void Make() override {
    AddInput("X", "The input tensor.");
    AddOutput("Y", "Output of relu_op");
    AddComment(R"DOC(
Relu Operator.
Y = max(X, 0)
)DOC");
  }
};

// Forward op definition. InferShape gives output Y the same dims as input X.
class Relu2Op : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Copies the dims of input "X" onto output "Y".
  void InferShape(framework::InferShapeContext* ctx) const override {
    auto in_dims = ctx->GetInputDim("X");
    ctx->SetOutputDim("Y", in_dims);
  }
};

// Forward kernel: Y = max(0, X), computed element by element.
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class Relu2Kernel : public framework::OpKernel<T> {
 public:
  // Reads input "X", writes max(0, x[i]) into output "Y" for every element.
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in_t = ctx.Input<Tensor>("X");
    auto* out_t = ctx.Output<Tensor>("Y");
    auto x = in_t->data<T>();
    // mutable_data allocates the output memory and returns a pointer to it.
    auto y = out_t->mutable_data<T>(ctx.GetPlace());
    // NOTE(review): `int` may be narrower than numel()'s return type —
    // confirm this is acceptable for very large tensors.
    for (int i = 0; i < in_t->numel(); ++i) {
      y[i] = std::max(static_cast<T>(0.), x[i]);
    }
  }
};

// Describes the backward op: inputs Y and dY, output dX, attributes copied
// from the forward op. T is the op description type the maker is
// instantiated with at registration time.
template <typename T>
class Relu2GradMaker : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

  // Fills in the "relu2_grad" op: wires the forward output Y and its gradient
  // as inputs, and the gradient of X as the output.
  void Apply(GradOpPtr<T> op) const override {
    op->SetType("relu2_grad");
    op->SetInput("Y", this->Output("Y"));
    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
    op->SetAttrMap(this->Attrs());
    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
  }
};

// Backward op definition. InferShape gives dX the same dims as dY.
class Relu2GradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Copies the dims of the gradient of "Y" onto the gradient of "X".
  void InferShape(framework::InferShapeContext* ctx) const override {
    auto in_dims = ctx->GetInputDim(framework::GradVarName("Y"));
    ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
  }
};

// Backward kernel: dx = dy * (y > 0 ? 1 : 0), computed element by element.
template <typename DeviceContext, typename T>
class Relu2GradKernel : public framework::OpKernel<T> {
 public:
  // Reads the forward output "Y" and its gradient, and writes the gradient
  // of "X".
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* dy_t = ctx.Input<Tensor>(framework::GradVarName("Y"));
    auto* y_t = ctx.Input<Tensor>("Y");
    auto* dx_t = ctx.Output<Tensor>(framework::GradVarName("X"));

    auto dy = dy_t->data<T>();
    auto y = y_t->data<T>();
    auto dx = dx_t->mutable_data<T>(ctx.GetPlace());
    // The upstream gradient passes through only where the forward output
    // was positive.
    for (int i = 0; i < y_t->numel(); ++i) {
      dx[i] = dy[i] * (y[i] > static_cast<T>(0) ? 1. : 0.);
    }
  }
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
// Register the forward and backward ops.
// The op type is registered as relu2 to keep it distinct from the relu op
// that already exists inside the framework.
REGISTER_OPERATOR(relu2,
                  ops::Relu2Op,
                  ops::Relu2OpMaker,
                  ops::Relu2GradMaker<paddle::framework::OpDesc>,
                  ops::Relu2GradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(relu2_grad, ops::Relu2GradOp);
// Register the CPU kernels.
REGISTER_OP_CPU_KERNEL(relu2,
                       ops::Relu2Kernel<CPU, float>,
                       ops::Relu2Kernel<CPU, double>);
REGISTER_OP_CPU_KERNEL(relu2_grad,
                       ops::Relu2GradKernel<CPU, float>,
                       ops::Relu2GradKernel<CPU, double>);
ReLU OP的GPU實現， relu_op.cu 文件:
// relu_op.cu
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

template
global void KeRelu2(const T* x, const int num, T* y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
y[i] = max(x[i], static_cast(0.));
}
}

// GPU implementation of the forward op kernel.
template <typename DeviceContext, typename T>
class Relu2CUDAKernel : public framework::OpKernel<T> {
 public:
  // Launches KeRelu2 over all elements of input "X", writing to output "Y".
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in_t = ctx.Input<Tensor>("X");
    auto* out_t = ctx.Output<Tensor>("Y");
    auto x = in_t->data<T>();
    auto y = out_t->mutable_data<T>(ctx.GetPlace());

    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    int num = in_t->numel();
    int block = 512;
    // Round up so that grid * block covers every element.
    int grid = (num + block - 1) / block;
    KeRelu2<T><<<grid, block, 0, dev_ctx.stream()>>>(x, num, y);
  }
};

template
global void KeRelu2Grad(const T* y, const T* dy, const int num, T* dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
dx[i] = dy[i] * (y[i] > 0 ? 1. : 0.);
}
}

// GPU implementation of the backward op kernel.
template <typename DeviceContext, typename T>
class Relu2GradCUDAKernel : public framework::OpKernel<T> {
 public:
  // Launches KeRelu2Grad using the forward output "Y" and its gradient,
  // writing the gradient of "X".
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* dy_t = ctx.Input<Tensor>(framework::GradVarName("Y"));
    auto* y_t = ctx.Input<Tensor>("Y");
    auto* dx_t = ctx.Output<Tensor>(framework::GradVarName("X"));

    auto dy = dy_t->data<T>();
    auto y = y_t->data<T>();
    auto dx = dx_t->mutable_data<T>(ctx.GetPlace());

    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    int num = dy_t->numel();
    int block = 512;
    // Round up so that grid * block covers every element.
    int grid = (num + block - 1) / block;
    KeRelu2Grad<T><<<grid, block, 0, dev_ctx.stream()>>>(y, dy, num, dx);
  }
};

} // namespace operators
} // namespace paddle

using CUDA = paddle::platform::CUDADeviceContext;
// 注冊前向的GPU Kernel
REGISTER_OP_CUDA_KERNEL(relu2,
paddle::operators::Relu2CUDAKernel<CUDA, float>,
paddle::operators::Relu2CUDAKernel<CUDA, double>);
// 注冊反向的GPU Kernel
REGISTER_OP_CUDA_KERNEL(relu2_grad,
paddle::operators::Relu2GradCUDAKernel<CUDA, float>,
paddle::operators::Relu2GradCUDAKernel<CUDA, double>);
注意點:

OP的type不能和PaddlePaddle已有的OP type相同，否則在Python中使用時會報錯。
自定義OP的編譯
需要將實現的C++、CUDA代碼編譯成動態庫，下面通過g++/nvcc編譯，也可以寫Makefile或者CMake。
編譯需要include PaddlePaddle的相關頭文件，如上面代碼 paddle/fluid/framework/op_registry.h ，需要鏈接PaddlePaddle的lib庫。可通過下面命令獲取到:

# python
>>> import paddle
>>> print(paddle.sysconfig.get_include())
/paddle/pyenv/local/lib/python2.7/site-packages/paddle/include
>>> print(paddle.sysconfig.get_lib())
/paddle/pyenv/local/lib/python2.7/site-packages/paddle/libs
下面命令可編譯出動態庫:
# Ask the installed paddle package for its header and library directories.
include_dir=$( python -c 'import paddle; print(paddle.sysconfig.get_include())' )
lib_dir=$( python -c 'import paddle; print(paddle.sysconfig.get_lib())' )

echo $include_dir
echo $lib_dir

# PaddlePaddle >= 1.6.1 only needs ${include_dir} and ${include_dir}/third_party
# on the include path.
# Compile the CUDA source into an object file.
nvcc relu_op.cu -c -o relu_op.cu.o -ccbin cc -DPADDLE_WITH_CUDA -DEIGEN_USE_GPU -DPADDLE_USE_DSO -DPADDLE_WITH_MKLDNN -Xcompiler -fPIC -std=c++11 -Xcompiler -fPIC -w --expt-relaxed-constexpr -O3 -DNVCC \
    -I ${include_dir} \
    -I ${include_dir}/third_party

# Compile the C++ source and link everything into the shared library.
g++ relu_op.cc relu_op.cu.o -o relu2_op.so -shared -fPIC -std=c++11 -O3 -DPADDLE_WITH_MKLDNN \
    -I ${include_dir} \
    -I ${include_dir}/third_party \
    -L /usr/local/cuda/lib64 \
    -L ${lib_dir} -lpaddle_framework -lcudart
注意點:

通過NVCC編譯CUDA源文件時，需要加編譯選項 -DPADDLE_WITH_CUDA -DEIGEN_USE_GPU -DPADDLE_USE_DSO，在框架源碼中會使用這些宏定義進行條件編譯。用戶自定義的C++ OP實現編譯時，選項的開啟狀態需要和核心框架編譯行為一致。如EIGEN_USE_GPU是使用Eigen數學庫的GPU實現時需要增加的編譯選項。
如果飛槳安裝包中不包含MKLDNN庫，則需要去掉編譯選項-DPADDLE_WITH_MKLDNN。核心框架源碼中(比如tensor.h)有使用此宏定義進行條件編譯，該選項是否打開同樣需要和核心框架編譯行為保持一致。默認的飛槳安裝包中含有MKLDNN庫。
可多個OP編譯到同一個動態庫中。
通過pip方式安裝的PaddlePaddle由GCC 4.8編譯得到，由于GCC 4.8和GCC 5以上C++11 ABI不兼容，編寫的自定義OP，需要通過GCC 4.8編譯。若是GCC 5及以上的環境上使用自定義OP，推薦使用Docker安裝PaddlePaddle，使得編譯Paddle和編譯自定義OP的GCC版本相同。
封裝Python Layer接口
需要使用 paddle.incubate.load_op_library 接口調用加載動態庫，使得PaddlePaddle的主進程中可以使用用戶自定義的OP。

# custom_op.py
import paddle.incubate as incubate

# Load the shared library with load_op_library so the custom op can be used.
incubate.load_op_library('relu2_op.so')

from paddle.incubate import LayerHelper


def relu2(x, name=None):
    """Append a relu2 op taking ``x`` as input "X" and return its output "Y".

    Args:
        x: Input variable; the output is created with the same dtype.
        name: Optional name, forwarded to LayerHelper through ``locals()``.

    Returns:
        The output variable created for the op.
    """
    # The type "relu2" must match the type the op was registered under.
    helper = LayerHelper("relu2", **locals())
    # Create the output Variable.
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
    helper.append_op(type="relu2", inputs={"X": x}, outputs={"Y": out})
    return out
注意點:

一個動態庫只需使用paddle.incubate.load_op_library在paddle import之后加載一次即可。
Python接口的封裝和PaddlePaddle框架內部的封裝相同，更多的示例也可以閱讀源碼中 python/paddle/fluid/layers/nn.py的代碼示例。
單測測試
可以寫個簡單的Python程序測試計算的正確性:
靜態圖模式
import numpy as np
import paddle
from custom_op import relu2

# Static-graph check: build the program, run it once, compare with NumPy.
paddle.enable_static()
data = paddle.static.data(name='data', shape=[None, 32], dtype='float32')
relu = relu2(data)
use_gpu = True  # or False
paddle.set_device('gpu' if use_gpu else 'cpu')
exe = paddle.static.Executor()

x = np.random.uniform(-1, 1, [4, 32]).astype('float32')
out, = exe.run(feed={'data': x}, fetch_list=[relu])
# Assert the comparison so a mismatch fails the script instead of being
# silently discarded.
assert np.allclose(out, np.maximum(x, 0.))
動態圖模式
import numpy as np
import paddle
from custom_op import relu2

# Dynamic-graph check: run the op eagerly and compare with NumPy.
use_gpu = True  # or False
paddle.set_device('gpu' if use_gpu else 'cpu')

x = np.random.uniform(-1, 1, [4, 32]).astype('float32')
t = paddle.to_tensor(x)
out = relu2(t)
# Assert the comparison so a mismatch fails the script instead of being
# silently discarded.
assert np.allclose(out.numpy(), np.maximum(x, 0.))
接下來可以在模型中使用您自定義的OP了!
如何在C++預測庫中使用
暫時不支持在C++預測庫中使用，后續會補充在C++預測庫中的使用示例。
FAQ

Q: 如果出現類似錯誤: relu2_op.so: cannot open shared object file: No such file or directory 以及 libpaddle_framework.so: cannot open shared object file: No such file or directory。
A: 需要將relu2_op.so所在路徑以及libpaddle_framework.so路徑(即paddle.sysconfig.get_lib()得到路徑)設置到環境變量LD_LIBRARY_PATH中:

假如relu2_op.so路徑是：`paddle/test`，對于Linux環境設置:

# Prepend the directory holding relu2_op.so and paddle's library directory
# to the dynamic loader search path.
export LD_LIBRARY_PATH="paddle/test:$( python -c 'import paddle; print(paddle.sysconfig.get_lib())' ):$LD_LIBRARY_PATH"

總結

以上是生活随笔為你收集整理的如何在框架外部自定义C++ OP的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：如何写新的Python OP
下一篇： Paddle Release Note

生活经验

如何在框架外部自定义C++ OP

python

PaddlePaddel >=1.6.1, 僅需要include ${include_dir} 和 ${include_dir}/third_party

custom_op.py

調用load_op_library加載動態庫

假如relu2_op.so路徑是：paddle/test，對于Linux環境設置:

總結

假如relu2_op.so路徑是：`paddle/test`，對于Linux環境設置: