
YOLOv4 Performance Analysis (Part 2)

VI. Weight Update

In train_detector() in "darknet/src/detector.c":

    ......
    /* start training the network */
    float loss = 0;
#ifdef GPU
    if (ngpus == 1) {
        int wait_key = (dont_show) ? 0 : 1;
        loss = train_network_waitkey(net, train, wait_key); // entry point in network.c: allocates buffers and runs one training step.
    }
    else {
        loss = train_networks(nets, ngpus, train, 4); // entry point in network_kernels.cu: multi-GPU training.
    }
#else
    loss = train_network(net, train); // CPU mode: wraps train_network_waitkey(net, d, 0).
#endif
    if (avg_loss < 0 || avg_loss != avg_loss) avg_loss = loss; // if (-inf or nan)
    avg_loss = avg_loss*.9 + loss*.1;
    ......
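Note that the avg_loss printed during training is an exponential moving average, avg_t = 0.9*avg_{t-1} + 0.1*loss_t, so it smooths the noisy per-iteration loss over roughly the last ten iterations rather than showing the raw per-batch value.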

以CPU訓(xùn)練為例,“darknet/src/network.c”–train_network()函數(shù),執(zhí)行train_network_waitkey(net, d, 0):

float train_network_waitkey(network net, data d, int wait_key)
{
    assert(d.X.rows % net.batch == 0);
    int batch = net.batch; // parse_net_options() in parser.c (reached via parse_network_cfg() from train_detector() in detector.c) does net->batch /= subdivs, so batch here is batch_size = batch/subdivisions.
    int n = d.X.rows / batch; // number of mini-batches; for a single GPU or CPU, n = subdivisions.
    float* X = (float*)xcalloc(batch * d.X.cols, sizeof(float));
    float* y = (float*)xcalloc(batch * d.y.cols, sizeof(float));
    int i;
    float sum = 0;
    for(i = 0; i < n; ++i){
        get_next_batch(d, batch, i*batch, X, y);
        net.current_subdivision = i;
        float err = train_network_datum(net, X, y); // train_network_datum() returns the loss for this mini-batch.
        sum += err;
        if(wait_key) wait_key_cv(5);
    }
    (*net.cur_iteration) += 1;
#ifdef GPU
    update_network_gpu(net);
#else // GPU
    update_network(net);
#endif // GPU
    free(X);
    free(y);
    return (float)sum/(n*batch);
}
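get_next_batch() simply copies the i-th slice of the loaded data into the X/y staging buffers; for reference, its definition in data.c is essentially:

void get_next_batch(data d, int n, int offset, float *X, float *y)
{
    int j;
    for(j = 0; j < n; ++j){
        int index = offset + j;
        memcpy(X + j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float)); // one flattened input image per row.
        memcpy(y + j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float)); // the matching truth row.
    }
}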

The core is the call to train_network_datum(), which computes the error:

float train_network_datum(network net, float *x, float *y)
{
#ifdef GPU
    if(gpu_index >= 0) return train_network_datum_gpu(net, x, y); // GPU mode: train_network_datum_gpu() in network_kernels.cu.
#endif
    network_state state={0};
    *net.seen += net.batch;
    state.index = 0;
    state.net = net;
    state.input = x;
    state.delta = 0;
    state.truth = y;
    state.train = 1;
    forward_network(net, state);  // CPU mode: forward pass.
    backward_network(net, state); // CPU mode: backpropagation.
    float error = get_network_cost(net); // compute the loss.
    return error;
}
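The network_state being filled in above is the per-step bookkeeping struct from darknet.h; roughly (exact field order may vary by version):

typedef struct network_state {
    float *truth;     // ground-truth labels for the current mini-batch.
    float *input;     // input of the layer currently being processed.
    float *delta;     // error term to propagate back to the previous layer.
    float *workspace; // shared scratch buffer, e.g. for im2col.
    int train;        // 1 while training, 0 at inference.
    int index;        // index of the current layer.
    network net;
} network_state;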

進(jìn)一步分析forward_network()函數(shù):

void forward_network(network net, network_state state)
{
    state.workspace = net.workspace;
    int i;
    for(i = 0; i < net.n; ++i){
        state.index = i;
        layer l = net.layers[i];
        if(l.delta && state.train){
            scal_cpu(l.outputs * l.batch, 0, l.delta, 1); // scal_cpu() in blas.c zeroes the layer's delta.
        }
        l.forward(l, state); // l.forward is a per-layer function pointer: in convolutional_layer.c, l.forward = forward_convolutional_layer; in yolo_layer.c, l.forward = forward_yolo_layer. This is the CPU forward pass.
        state.input = l.output; // the output of this layer becomes the input of the next.
    }
}
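The l.forward(l, state) dispatch works because every layer constructor stores function pointers on the layer struct; in make_convolutional_layer(), for example:

    l.forward = forward_convolutional_layer;
    l.backward = backward_convolutional_layer;
    l.update = update_convolutional_layer;

so the same loop drives every layer type without a switch statement.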

For a convolutional layer, forward_convolutional_layer():

void forward_convolutional_layer(convolutional_layer l, network_state state)
{
    /* output height/width of the convolution. */
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int i, j;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1); // zero the output.

    /* xnor-net: binarize the inputs and weights. */
    if (l.xnor && (!l.align_bit_weights || state.train)) {
        if (!l.align_bit_weights || state.train) {
            binarize_weights(l.weights, l.n, l.nweights, l.binary_weights);
        }
        swap_binary(&l);
        binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input);
        state.input = l.binary_input;
    }

    /* m: number of kernels; k: parameters per kernel (l.size is the kernel size); n: pixels per output feature map. */
    int m = l.n / l.groups;
    int k = l.size*l.size*l.c / l.groups;
    int n = out_h*out_w;

    static int u = 0;
    u++;
    for(i = 0; i < l.batch; ++i)
    {
        for (j = 0; j < l.groups; ++j)
        {
            /* weights holds the kernel parameters: a points at the weights, b at the workspace, c at the output. */
            float *a = l.weights + j*l.nweights / l.groups;
            float *b = state.workspace;
            float *c = l.output + (i*l.groups + j)*n*m;

            if (l.xnor && l.align_bit_weights && !state.train && l.stride_x == l.stride_y)
            {
                memset(b, 0, l.bit_align*l.size*l.size*l.c * sizeof(float));

                if (l.c % 32 == 0)
                {
                    int ldb_align = l.lda_align;
                    size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
                    int re_packed_input_size = l.c * l.w * l.h;
                    memset(state.workspace, 0, re_packed_input_size * sizeof(float));

                    const size_t new_c = l.c / 32;
                    size_t in_re_packed_input_size = new_c * l.w * l.h + 1;
                    memset(l.bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t));

                    // float32x4 by channel (as in cuDNN)
                    repack_input(state.input, state.workspace, l.w, l.h, l.c);

                    // 32 x floats -> 1 x uint32_t
                    float_to_bit(state.workspace, (unsigned char *)l.bin_re_packed_input, l.c * l.w * l.h);

                    /* image to column: unroll the image patch under each kernel position into a column, so the convolution becomes a matrix product. */
                    im2col_cpu_custom((float *)l.bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);

                    int new_k = l.size*l.size*l.c / 32;

                    transpose_uint32((uint32_t *)state.workspace, (uint32_t *)l.t_bit_input, new_k, n, n, new_ldb);

                    /* General Matrix Multiply: the matrix product that implements the convolution. */
                    gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr);
                }
                else
                {
                    im2col_cpu_custom_bin(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align);

                    // transpose B from NxK to KxN (x-axis (ldb = l.size*l.size*l.c) - should be multiple of 8 bits)
                    {
                        int ldb_align = l.lda_align;
                        size_t new_ldb = k + (ldb_align - k%ldb_align);
                        size_t t_intput_size = binary_transpose_align_input(k, n, state.workspace, &l.t_bit_input, ldb_align, l.bit_align);

                        // 5x times faster than gemm()-float32
                        gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr);
                    }
                }

                add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); // add the bias term.

                /* non-linearity: leaky ReLU, Mish and other activations. */
                if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
                else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
                else if (l.activation == NORM_CHAN) activate_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
                else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 0);
                else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 1);
                else activate_array_cpu_custom(l.output, m*n*l.batch, l.activation);
                return;
            }
            else {
                float *im = state.input + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w;

                if (l.size == 1) {
                    b = im;
                }
                else {
                    im2col_cpu_ext(im,   // input
                        l.c / l.groups,  // input channels
                        l.h, l.w,        // input size (h, w)
                        l.size, l.size,  // kernel size (h, w)
                        l.pad * l.dilation, l.pad * l.dilation, // padding (h, w)
                        l.stride_y, l.stride_x,                 // stride (h, w)
                        l.dilation, l.dilation,                 // dilation (h, w)
                        b);              // output
                }

                gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
                // bit-count to float
            }
        }
    }

    if(l.batch_normalize){ // batch-norm layer: speeds up convergence.
        forward_batchnorm_layer(l, state);
    }
    else { // otherwise just add the bias: output += bias.
        add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
    }

    /* non-linearity: leaky ReLU, Mish and other activations. */
    if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
    else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
    else if (l.activation == NORM_CHAN) activate_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
    else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 0);
    else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 1);
    else activate_array_cpu_custom(l.output, l.outputs*l.batch, l.activation);

    if(l.binary || l.xnor) swap_binary(&l); // binarization.

    if(l.assisted_excitation && state.train) assisted_excitation_forward(l, state);

    if (l.antialiasing) {
        network_state s = { 0 };
        s.train = state.train;
        s.workspace = state.workspace;
        s.net = state.net;
        s.input = l.output;
        forward_convolutional_layer(*(l.input_layer), s);
        memcpy(l.output, l.input_layer->output, l.input_layer->outputs * l.input_layer->batch * sizeof(float));
    }
}
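The im2col/gemm pair above is the heart of the CPU convolution. For reference, the plain versions in im2col.c and gemm.c look like the following; the *_ext and *_custom variants used above add per-axis strides, dilation and vectorization on top of the same idea:

float im2col_get_pixel(float *im, int height, int width, int channels,
                       int row, int col, int channel, int pad)
{
    row -= pad;
    col -= pad;
    if (row < 0 || col < 0 || row >= height || col >= width) return 0; // zero padding.
    return im[col + width*(row + height*channel)];
}

void im2col_cpu(float* data_im, int channels, int height, int width,
                int ksize, int stride, int pad, float* data_col)
{
    int c, h, w;
    int height_col = (height + 2*pad - ksize) / stride + 1;
    int width_col  = (width  + 2*pad - ksize) / stride + 1;
    int channels_col = channels * ksize * ksize; // one row per (channel, ky, kx) combination.
    for (c = 0; c < channels_col; ++c) {
        int w_offset = c % ksize;
        int h_offset = (c / ksize) % ksize;
        int c_im = c / ksize / ksize;
        for (h = 0; h < height_col; ++h) {
            for (w = 0; w < width_col; ++w) {
                int im_row = h_offset + h * stride;
                int im_col = w_offset + w * stride;
                int col_index = (c * height_col + h) * width_col + w;
                data_col[col_index] = im2col_get_pixel(data_im, height, width,
                        channels, im_row, im_col, c_im, pad);
            }
        }
    }
}

void gemm_nn(int M, int N, int K, float ALPHA,
             float *A, int lda, float *B, int ldb, float *C, int ldc)
{
    int i, j, k;
    for (i = 0; i < M; ++i) {         // M = number of kernels.
        for (k = 0; k < K; ++k) {     // K = size*size*channels per kernel.
            float A_PART = ALPHA*A[i*lda + k];
            for (j = 0; j < N; ++j) { // N = out_h*out_w output pixels.
                C[i*ldc + j] += A_PART*B[k*ldb + j];
            }
        }
    }
}

With the input unrolled this way, a single gemm(0, 0, m, n, k, ...) call computes all m output feature maps at once.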

For a yolo layer, forward_yolo_layer():

void forward_yolo_layer(const layer l, network_state state)
{
    int i, j, b, t, n;
    memcpy(l.output, state.input, l.outputs*l.batch * sizeof(float)); // copy the layer input straight to the layer output.

    /* On CPU, squash the predicted x, y, confidence and all class scores with a sigmoid so they lie in 0~1. */
#ifndef GPU
    for (b = 0; b < l.batch; ++b) {
        for (n = 0; n < l.n; ++n) {
            int index = entry_index(l, b, n*l.w*l.h, 0); // index where batch b starts.
            /* logistic regression on the predicted tx, ty. */
            activate_array(l.output + index, 2 * l.w*l.h, LOGISTIC); // x, y
            scal_add_cpu(2 * l.w*l.h, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output + index, 1); // scale x,y
            index = entry_index(l, b, n*l.w*l.h, 4); // index where the confidences of batch b start.
            activate_array(l.output + index, (1 + l.classes)*l.w*l.h, LOGISTIC); // logistic regression on the predicted confidence and class scores.
        }
    }
#endif

    // delta is zeroed
    memset(l.delta, 0, l.outputs * l.batch * sizeof(float)); // zero the yolo layer's error terms (for the whole batch).
    if (!state.train) return; // not training: return.

    float tot_iou = 0; // total IOU.
    float tot_giou = 0;
    float tot_diou = 0;
    float tot_ciou = 0;
    float tot_iou_loss = 0;
    float tot_giou_loss = 0;
    float tot_diou_loss = 0;
    float tot_ciou_loss = 0;
    float recall = 0;
    float recall75 = 0;
    float avg_cat = 0;
    float avg_obj = 0;
    float avg_anyobj = 0;
    int count = 0;
    int class_count = 0;
    *(l.cost) = 0; // the yolo layer's total loss, initialized to 0.

    for (b = 0; b < l.batch; ++b) { // loop over the images in the batch.
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w; ++i) { // loop over the grid cells; current cell is [j, i].
                for (n = 0; n < l.n; ++n) { // loop over the predicted boxes; current box is [n].
                    const int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); // index of the predicted box's class scores.
                    const int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4); // index of the predicted box's objectness.
                    const int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); // index of box n in cell j*w+i.
                    const int stride = l.w*l.h;
                    /* Decode box n of cell j*w+i: position [x,y] relative to this feature map, width/height [w,h] relative to the network input. */
                    box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w*l.h);
                    float best_match_iou = 0;
                    int best_match_t = 0;
                    float best_iou = 0; // best IOU so far.
                    int best_t = 0;     // id of the GT bbox with the best IOU.
                    for (t = 0; t < l.max_boxes; ++t) { // loop over the GT bboxes.
                        box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1); // convert GT bbox t from a float array to a box struct, to ease the IOU computation.
                        int class_id = state.truth[t*(4 + 1) + b*l.truths + 4]; // class of GT bbox t; check for labeling errors.
                        if (class_id >= l.classes || class_id < 0) {
                            printf("\n Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes - 1);
                            printf("\n truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f, class_id = %d \n", truth.x, truth.y, truth.w, truth.h, class_id);
                            if (check_mistakes) getchar();
                            continue; // if label contains class_id more than number of classes in the cfg-file and class_id check garbage value
                        }
                        if (!truth.x) break; // x == 0 means no more boxes: max_boxes slots are defined but not all are used.

                        float objectness = l.output[obj_index]; // predicted object confidence of this bbox.
                        if (isnan(objectness) || isinf(objectness)) l.output[obj_index] = 0;

                        /* check the predicted class scores; returns 1 if the probability of class_id exceeds 0.25. */
                        int class_id_match = compare_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness, class_id, 0.25f);

                        float iou = box_iou(pred, truth); // IOU between the predicted bbox and GT bbox t.
                        if (iou > best_match_iou && class_id_match == 1) { // the class_id_match == 1 condition requires the predicted class probability to exceed 0.25.
                            best_match_iou = iou;
                            best_match_t = t;
                        }
                        if (iou > best_iou) {
                            best_iou = iou; // update the best IOU.
                            best_t = t;     // remember GT bbox t.
                        }
                    }
                    avg_anyobj += l.output[obj_index]; // accumulate the predicted bbox confidences.
                    l.delta[obj_index] = l.cls_normalizer * (0 - l.output[obj_index]); // treat every predicted bbox as no-object and compute its confidence gradient; cls_normalizer is a balancing factor.
                    if (best_match_iou > l.ignore_thresh) { // best IOU above the threshold: the predicted box does contain an object.
                        const float iou_multiplier = best_match_iou*best_match_iou;// (best_match_iou - l.ignore_thresh) / (1.0 - l.ignore_thresh);
                        if (l.objectness_smooth) {
                            l.delta[obj_index] = l.cls_normalizer * (iou_multiplier - l.output[obj_index]);
                            int class_id = state.truth[best_match_t*(4 + 1) + b*l.truths + 4];
                            if (l.map) class_id = l.map[class_id];
                            const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
                            l.delta[class_index + stride*class_id] = class_multiplier * (iou_multiplier - l.output[class_index + stride*class_id]);
                        }
                        else l.delta[obj_index] = 0;
                    }
                    else if (state.net.adversarial) { // self-adversarial training.
                        int stride = l.w*l.h;
                        float scale = pred.w * pred.h;
                        if (scale > 0) scale = sqrt(scale);
                        l.delta[obj_index] = scale * l.cls_normalizer * (0 - l.output[obj_index]);
                        int cl_id;
                        for (cl_id = 0; cl_id < l.classes; ++cl_id) {
                            if (l.output[class_index + stride*cl_id] * l.output[obj_index] > 0.25)
                                l.delta[class_index + stride*cl_id] = scale * (0 - l.output[class_index + stride*cl_id]);
                        }
                    }
                    if (best_iou > l.truth_thresh) { // the predicted bbox counts as a fully correct prediction; with truth_thresh=1 in the cfg this branch can never fire.
                        const float iou_multiplier = best_iou*best_iou;// (best_iou - l.truth_thresh) / (1.0 - l.truth_thresh);
                        if (l.objectness_smooth) l.delta[obj_index] = l.cls_normalizer * (iou_multiplier - l.output[obj_index]);
                        else l.delta[obj_index] = l.cls_normalizer * (1 - l.output[obj_index]);
                        int class_id = state.truth[best_t*(4 + 1) + b*l.truths + 4];
                        if (l.map) class_id = l.map[class_id];
                        delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, 0, l.focal_loss, l.label_smooth_eps, l.classes_multipliers);
                        const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
                        if (l.objectness_smooth) l.delta[class_index + stride*class_id] = class_multiplier * (iou_multiplier - l.output[class_index + stride*class_id]);
                        box truth = float_to_box_stride(state.truth + best_t*(4 + 1) + b*l.truths, 1);
                        delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta);
                    }
                }
            }
        }
        for (t = 0; t < l.max_boxes; ++t) { // loop over the GT boxes.
            box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1); // convert GT bbox t from a float array to a box struct, to ease the IOU computation.
            if (truth.x < 0 || truth.y < 0 || truth.x > 1 || truth.y > 1 || truth.w < 0 || truth.h < 0) {
                char buff[256];
                printf(" Wrong label: truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f \n", truth.x, truth.y, truth.w, truth.h);
                sprintf(buff, "echo \"Wrong label: truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f\" >> bad_label.list", truth.x, truth.y, truth.w, truth.h);
                system(buff);
            }
            int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
            if (class_id >= l.classes || class_id < 0) continue; // if label contains class_id more than number of classes in the cfg-file and class_id check garbage value
            if (!truth.x) break; // x == 0 means no more boxes: max_boxes slots are defined, but fewer may actually be labeled.

            float best_iou = 0; // best IOU so far.
            int best_n = 0;     // index of the anchor bbox with the best IOU.
            i = (truth.x * l.w); // grid cell that GT bbox t falls into.
            j = (truth.y * l.h);
            box truth_shift = truth;
            truth_shift.x = truth_shift.y = 0; // move the GT box's position to (0, 0).
            for (n = 0; n < l.total; ++n) { // loop over all anchor bboxes to find the one with the best IOU against the GT bbox.
                box pred = { 0 };
                pred.w = l.biases[2 * n] / state.net.w;     // anchor width relative to the whole network input.
                pred.h = l.biases[2 * n + 1] / state.net.h; // anchor height relative to the whole network input.
                float iou = box_iou(pred, truth_shift); // IOU between the shifted GT box truth_shift and the anchor shape pred.
                if (iou > best_iou) {
                    best_iou = iou; // remember the best IOU.
                    best_n = n;     // remember anchor n.
                }
            }
            int mask_n = int_index(l.mask, best_n, l.n); // is the anchor recorded above predicted by this layer?
            if (mask_n >= 0) {
                int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
                if (l.map) class_id = l.map[class_id];
                int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); // index of the anchor box matching best_iou.
                const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f; // per-class weighting against sample imbalance, i.e. the alpha in focal loss.
                ious all_ious = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta); // [x,y,w,h] gradients for the anchor bbox matching best_iou.

                /* recent work on box regression (metric learning): IOU/GIOU/DIOU/CIOU losses. */
                // range is 0 <= 1
                tot_iou += all_ious.iou;
                tot_iou_loss += 1 - all_ious.iou;
                // range is -1 <= giou <= 1
                tot_giou += all_ious.giou;
                tot_giou_loss += 1 - all_ious.giou;
                tot_diou += all_ious.diou;
                tot_diou_loss += 1 - all_ious.diou;
                tot_ciou += all_ious.ciou;
                tot_ciou_loss += 1 - all_ious.ciou;

                int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4); // confidence index of the anchor box matching best_iou.
                avg_obj += l.output[obj_index]; // accumulate the confidences.
                l.delta[obj_index] = class_multiplier * l.cls_normalizer * (1 - l.output[obj_index]); // confidence gradient.

                int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1); // class index of the GT box matching best_iou.
                delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, &avg_cat, l.focal_loss, l.label_smooth_eps, l.classes_multipliers); // class gradients for the anchor box matching best_iou.

                ++count;
                ++class_count;
                if (all_ious.iou > .5) recall += 1;
                if (all_ious.iou > .75) recall75 += 1;
            }

            // iou_thresh
            for (n = 0; n < l.total; ++n) {
                int mask_n = int_index(l.mask, n, l.n);
                if (mask_n >= 0 && n != best_n && l.iou_thresh < 1.0f) {
                    box pred = { 0 };
                    pred.w = l.biases[2 * n] / state.net.w;
                    pred.h = l.biases[2 * n + 1] / state.net.h;
                    float iou = box_iou_kind(pred, truth_shift, l.iou_thresh_kind); // IOU, GIOU, MSE, DIOU, CIOU
                    // iou, n
                    if (iou > l.iou_thresh) {
                        int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
                        if (l.map) class_id = l.map[class_id];
                        int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
                        const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
                        ious all_ious = delta_yolo_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta);
                        // range is 0 <= 1
                        tot_iou += all_ious.iou;
                        tot_iou_loss += 1 - all_ious.iou;
                        // range is -1 <= giou <= 1
                        tot_giou += all_ious.giou;
                        tot_giou_loss += 1 - all_ious.giou;
                        tot_diou += all_ious.diou;
                        tot_diou_loss += 1 - all_ious.diou;
                        tot_ciou += all_ious.ciou;
                        tot_ciou_loss += 1 - all_ious.ciou;
                        int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
                        avg_obj += l.output[obj_index];
                        l.delta[obj_index] = class_multiplier * l.cls_normalizer * (1 - l.output[obj_index]);
                        int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
                        delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, &avg_cat, l.focal_loss, l.label_smooth_eps, l.classes_multipliers);
                        ++count;
                        ++class_count;
                        if (all_ious.iou > .5) recall += 1;
                        if (all_ious.iou > .75) recall75 += 1;
                    }
                }
            }
        }

        // averages the deltas obtained by the function: delta_yolo_box()_accumulate
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w; ++i) {
                for (n = 0; n < l.n; ++n) {
                    int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);       // index of box n in cell j*w+i.
                    int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); // class index of box n in cell j*w+i.
                    const int stride = l.w*l.h; // size of the feature map.
                    averages_yolo_deltas(class_index, box_index, stride, l.classes, l.delta); // average the accumulated gradients.
                }
            }
        }
    }

    ......

    // gIOU loss + MSE (objectness) loss
    if (l.iou_loss == MSE) {
        *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
    }
    else {
        // Always compute classification loss both for iou + cls loss and for logging with mse loss
        // TODO: remove IOU loss fields before computing MSE on class
        //       probably split into two arrays
        if (l.iou_loss == GIOU) {
            avg_iou_loss = count > 0 ? l.iou_normalizer * (tot_giou_loss / count) : 0; // mean IOU loss; see tot_giou_loss += 1 - all_ious.giou above.
        }
        else {
            avg_iou_loss = count > 0 ? l.iou_normalizer * (tot_iou_loss / count) : 0; // mean IOU loss; see tot_iou_loss += 1 - all_ious.iou above.
        }
        *(l.cost) = avg_iou_loss + classification_loss; // total loss into l.cost: IOU loss plus classification loss.
    }
    loss /= l.batch; // mean loss.
    classification_loss /= l.batch;
    iou_loss /= l.batch;
    ......
}
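For reference, the box decoding used above is the classic get_yolo_box() from yolo_layer.c (with scale_x_y already applied to the x/y logits during activation):

box get_yolo_box(float *x, float *biases, int n, int index, int i, int j,
                 int lw, int lh, int w, int h, int stride)
{
    box b;
    b.x = (i + x[index + 0*stride]) / lw;               // cell offset plus sigmoid(tx), relative to the feature map.
    b.y = (j + x[index + 1*stride]) / lh;
    b.w = exp(x[index + 2*stride]) * biases[2*n]   / w; // anchor width scaled by exp(tw), relative to the network input.
    b.h = exp(x[index + 3*stride]) * biases[2*n+1] / h;
    return b;
}

delta_yolo_box() inverts this mapping to turn a GT box into targets (tx, ty, tw, th) and writes scale * (target - prediction), or the chosen IOU-family gradient, into l.delta.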

再來(lái)分析backward_network()函數(shù):

void backward_network(network net, network_state state)
{
    int i;
    float *original_input = state.input;
    float *original_delta = state.delta;
    state.workspace = net.workspace;
    for(i = net.n-1; i >= 0; --i){
        state.index = i;
        if(i == 0){
            state.input = original_input;
            state.delta = original_delta;
        }else{
            layer prev = net.layers[i-1];
            state.input = prev.output;
            state.delta = prev.delta; // delta is a pointer, so writing through state.delta modifies the previous layer's delta.
        }
        layer l = net.layers[i];
        if (l.stopbackward) break;
        if (l.onlyforward) continue;
        l.backward(l, state); // l.backward is a per-layer function pointer: in convolutional_layer.c, l.backward = backward_convolutional_layer; in yolo_layer.c, l.backward = backward_yolo_layer. This is the CPU backward pass.
    }
}

For a convolutional layer, backward_convolutional_layer():

void backward_convolutional_layer(convolutional_layer l, network_state state)
{
    int i, j;
    /* m: number of kernels; n: parameters per kernel (l.size is the kernel size); k: pixels per output feature map. */
    int m = l.n / l.groups;
    int n = l.size*l.size*l.c / l.groups;
    int k = l.out_w*l.out_h;

    /* update delta. */
    if (l.activation == SWISH) gradient_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.delta);
    else if (l.activation == MISH) gradient_array_mish(l.outputs*l.batch, l.activation_input, l.delta);
    else if (l.activation == NORM_CHAN_SOFTMAX || l.activation == NORM_CHAN_SOFTMAX_MAXVAL) gradient_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
    else if (l.activation == NORM_CHAN) gradient_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
    else gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    if (l.batch_normalize) { // batch-norm layer: speeds up convergence.
        backward_batchnorm_layer(l, state);
    }
    else { // otherwise just the bias.
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
    }

    for (i = 0; i < l.batch; ++i) {
        for (j = 0; j < l.groups; ++j) {
            float *a = l.delta + (i*l.groups + j)*m*k;
            float *b = state.workspace;
            float *c = l.weight_updates + j*l.nweights / l.groups;

            /* Before entering this function, backward_network() set net.input to prev.output: for layer l, net.input is the output of layer l-1. */
            float *im = state.input + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w;

            im2col_cpu_ext(
                im,              // input
                l.c / l.groups,  // input channels
                l.h, l.w,        // input size (h, w)
                l.size, l.size,  // kernel size (h, w)
                l.pad * l.dilation, l.pad * l.dilation, // padding (h, w)
                l.stride_y, l.stride_x,                 // stride (h, w)
                l.dilation, l.dilation,                 // dilation (h, w)
                b);              // output

            gemm(0, 1, m, n, k, 1, a, k, b, k, 1, c, n); // weight updates of the current layer.

            /* Compute the previous layer's delta. Before entering this function, backward_network() set net.delta to prev.delta: for layer l, net.delta is the delta of layer l-1. */
            if (state.delta) {
                a = l.weights + j*l.nweights / l.groups;
                b = l.delta + (i*l.groups + j)*m*k;
                c = state.workspace;

                gemm(1, 0, n, k, m, 1, a, n, b, k, 0, c, k);

                col2im_cpu_ext(
                    state.workspace,  // input
                    l.c / l.groups,   // input channels (h, w)
                    l.h, l.w,         // input size (h, w)
                    l.size, l.size,   // kernel size (h, w)
                    l.pad * l.dilation, l.pad * l.dilation, // padding (h, w)
                    l.stride_y, l.stride_x,                 // stride (h, w)
                    l.dilation, l.dilation,                 // dilation (h, w)
                    state.delta + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w); // output (delta)
            }
        }
    }
}

For a yolo layer, backward_yolo_layer():

void backward_yolo_layer(const layer l, network_state state)
{
    axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1); // copy l.delta straight into the previous layer's delta; note that state.delta points at prev_layer.delta.
}

After the forward and backward passes, get_network_cost() computes the loss:

float get_network_cost(network net)
{
    int i;
    float sum = 0;
    int count = 0;
    for(i = 0; i < net.n; ++i){
        if(net.layers[i].cost){ // collect each layer's loss; only detection layers, i.e. the yolo layers, have a cost.
            sum += net.layers[i].cost[0]; // the layer's total loss lives in cost[0]; see forward_cost_layer() in cost_layer.c.
            ++count;
        }
    }
    return sum/count; // return the mean loss.
}

CIOU_Loss is one of YOLOv4's innovations. Compared with GIOU_Loss it brings in, on top of the overlap area, the center-point distance Dis_2, which separates predicted boxes a and b that differ in localization, and additionally an aspect-ratio consistency factor v between the predicted and target boxes, which separates predictions such as a and c that have the same overlap area and center distance but match the target box's aspect ratio differently.
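For reference, the standard formulation from the CIoU paper, which the IOU/GIOU/DIOU/CIOU branches in delta_yolo_box() follow:

\mathcal{L}_{CIoU} = 1 - IoU + \frac{\rho^2(\mathbf{b}, \mathbf{b}^{gt})}{c^2} + \alpha v,
\qquad
v = \frac{4}{\pi^2}\left(\arctan\frac{w^{gt}}{h^{gt}} - \arctan\frac{w}{h}\right)^2,
\qquad
\alpha = \frac{v}{(1 - IoU) + v}

where \rho is the distance between the two box centers (the Dis_2 term above) and c is the diagonal length of the smallest box enclosing both.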

計(jì)算好Loss需要update_network():

void update_network(network net)
{
    int i;
    int update_batch = net.batch*net.subdivisions;
    float rate = get_current_rate(net);
    for(i = 0; i < net.n; ++i){
        layer l = net.layers[i];
        if(l.update){
            l.update(l, update_batch, rate, net.momentum, net.decay); // per-layer function pointer: in convolutional_layer.c, l.update = update_convolutional_layer.
        }
    }
}

update_convolutional_layer():

void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate_init, float momentum, float decay)
{
    float learning_rate = learning_rate_init*l.learning_rate_scale;

    axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1); // axpy_cpu() in blas.c: for(i = 0; i < l.nweights; ++i) l.weight_updates[i*1] -= decay*batch*l.weights[i*1]; (weight decay).
    axpy_cpu(l.nweights, learning_rate / batch, l.weight_updates, 1, l.weights, 1); // for(i = 0; i < l.nweights; ++i) l.weights[i*1] += (learning_rate/batch)*l.weight_updates[i*1];
    scal_cpu(l.nweights, momentum, l.weight_updates, 1); // scal_cpu() in blas.c: for(i = 0; i < l.nweights; ++i) l.weight_updates[i*1] *= momentum;

    axpy_cpu(l.n, learning_rate / batch, l.bias_updates, 1, l.biases, 1); // for(i = 0; i < l.n; ++i) l.biases[i*1] += (learning_rate/batch)*l.bias_updates[i*1];
    scal_cpu(l.n, momentum, l.bias_updates, 1); // for(i = 0; i < l.n; ++i) l.bias_updates[i*1] *= momentum;

    if (l.scales) {
        axpy_cpu(l.n, learning_rate / batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.n, momentum, l.scale_updates, 1);
    }
}
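The two blas.c helpers referenced in the comments are one-line loops:

void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    int i;
    for(i = 0; i < N; ++i) Y[i*INCY] += ALPHA*X[i*INCX]; // Y += ALPHA * X
}

void scal_cpu(int N, float ALPHA, float *X, int INCX)
{
    int i;
    for(i = 0; i < N; ++i) X[i*INCX] *= ALPHA; // X *= ALPHA
}

Read together, update_convolutional_layer() is plain momentum SGD with L2 weight decay: u <- u - lambda*B*w, then w <- w + (eta/B)*u, then u <- m*u, with B the effective batch size (batch x subdivisions), eta the learning rate, lambda the decay and m the momentum.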

Likewise, network_kernels.cu contains the GPU-mode forward and backward counterparts, which add data-format conversions and acceleration; since only the principles matter here, the GPU code is skipped for now.

void forward_backward_network_gpu(network net, float *x, float *y)
{
    ...... // build the network_state from x and y (elided).
    forward_network_gpu(net, state);  // forward pass.
    backward_network_gpu(net, state); // backward pass.
}

In CPU mode, weights are updated with conventional gradient descent with momentum; network.c also provides a train_network_sgd() interface. GPU mode additionally offers an adam option, visible in make_convolutional_layer() in convolutional_layer.c.

VII. Tuning Notes

本人在實(shí)際項(xiàng)目中涉及的是工業(yè)中的鋼鐵表面缺陷檢測(cè)場(chǎng)景,不到2000張圖片,3類(lèi),數(shù)據(jù)量很少。理論上YOLO系列并不太適合缺陷檢測(cè)的問(wèn)題,基于分割+分類(lèi)的網(wǎng)絡(luò)、Cascade-RCNN等或許是更好的選擇,但我本著實(shí)驗(yàn)的態(tài)度,進(jìn)行了多輪的訓(xùn)練和對(duì)比,整體上效果還是不錯(cuò)的。

1. max_batches: on the GitHub project AlexeyAB suggests classes * 2000 as a reference, and no fewer than 6000, but that assumes pretrained weights. If you train from scratch, increase it according to your data; the network needs extra time to learn from zero.

2. pretrain or not: with very little data, pretraining does make the model converge faster and works well. For defect detection, however, the defect features themselves are quite distinctive; even with my small dataset, training from scratch gave slightly better results.

3. anchors: the default anchors in the cfg were computed on COCO and are fairly balanced across scales, so using them will not hurt. If your own data is unevenly distributed in scale, though, generate new anchors with the script in the source tree (see the sketch after this list). Note that you must set the mask indices according to the sizes of the generated anchors (first yolo layer: < 30x30; second: < 60x60; third: the rest) and update the filters parameter of the conv layer in front of each yolo layer.

4. rotate: for detection, YOLOv4 does not actually use rotation for data augmentation, so I augmented the smallest class offline with 180-degree rotation, doubling its sample count. The effect is not yet obvious, perhaps because the added data is still too little; the comparison runs are still training and can be reported later.

5. mosaic: mosaic augmentation is a must and gives a clear mAP improvement; it requires OpenCV and cannot be used together with cutmix.
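As a sketch for item 3 (the data file path, cluster count and input size are placeholders for your own setup), anchors are regenerated with the built-in calc_anchors mode:

./darknet detector calc_anchors data/obj.data -num_of_clusters 9 -width 416 -height 416

The nine resulting pairs are then pasted into the anchors= line of each [yolo] section, with masks and the preceding filters adjusted as described above.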

Draw object:

#if defined(OPENCV) && defined(GPU)

read_data_cfg

option_find_str

get_labels_custom

load_alphabet

parse_network_cfg

parse_network_cfg_custom

set_batch_network

load_weights

load_image

resize_image

copy_image

cv_draw_object

basecfg

draw_train_chart

forward_backward_network_gpu

draw_train_loss

crop_image

copy_image_inplace

embed_image

show_image_cv

quantize_image

network_predict

save_image_png

get_network_boxes

do_nms_sort

diounms_sort

draw_detections_v3

save_image

calc_anchors

read_data_cfg

option_find_str

get_paths

list_to_array

option_find_int

replace_image_to_label

read_boxes

counter_per_class

calculating k-means++

make_matrix

do_kmeans

show_anhors

validate_detector_recall

parse_network_cfg_custom

load_weights

fuse_conv_batchnorm

load_image

resize_image

basecfg

network_predict

get_network_boxes

do_nms_obj

replace_image_to_label

read_boxes

validate_detector_map

read_data_cfg

option_find_str

get_labels_custom

read_map

remember_network_recurrent_state

free_network_recurrent_state

parse_network_cfg_custom

load_weights

fuse_conv_batchnorm

calculate_binary_weights

get_paths

list_to_array

// For multi-class precision and recall computation

load_data_in_thread

pthread_join

load_data_in_thread

basecfg

network_predict

get_network_boxes

do_nms_sort

diounms_sort

set_track_id

replace_image_to_label

read_boxes

SORT(detections)

// for PR-curve

// correct mAP calculation: ImageNet, PascalVOC 2010-2012

// add remaining area of PR curve when recall isn't 0 at rank-1

// free memory

restore_network_recurrent_state

return mean_average_precision;

train_detector:

read_data_cfg

option_find_str

cuda_set_device

parse_network_cfg_custom

get_labels_custom

basecfg

cuda_set_device

parse_network_cfg

get_current_iteration

draw_train_chart

load_data

rand_scale

// at the beginning (check if enough memory) and at the end (calc rolling mean/variance)
pthread_join

load_data

resize_network

pthread_join

load_data

float_to_box

float_to_image

train_network_waitkey

train_networks

get_current_iteration

// calculate mAP for each 4 Epochs

resize_network

copy_weights_net

// combine Training and Validation networks

draw_train_loss

sync_nets

save_weights

// free memory

//free_network(net);

test_detector:

read_data_cfg

option_find_str

get_labels_custom

parse_network_cfg_custom

load_weights

fuse_conv_batchnorm(net);

calculate_binary_weights(net);

letterbox_image

resize_image

get_network_boxes

do_nms_sort

diounms_sort

draw_detections_v3

save_image

show_image

detection_to_json

replace_image_to_label

// free memory

demo

parse_network_cfg_custom

load_weights

fuse_conv_batchnorm(net);

calculate_binary_weights(net);

get_capture_video_stream

get_capture_webcam

custom_create_thread

fetch_in_thread_sync

detect_in_thread_sync

create_window_cv

create_video_writer

get_time_point

custom_atomic_store_int

do_nms_obj

diounms_sort

set_track_id

send_json

send_http_post_request

draw_detections_cv_v3

max_val_cmp

send_mjpeg

write_frame_cv

this_thread_yield

release_video_writer

//free memory and thread

Coco依賴(lài)的軟件:coco.data,yolov4.cfg,yolov4.weights

duration_make_convolutional_layer: 336607

convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer, int assisted_excitation, int deform, int train)

1) Preprocessing: blur, antialiasing

2) cuda_make_array

3) get_convolutional_workspace_size

4) make_convolutional_layer

5) push_convolutional_layer, read_weights

duration_run_detector_demo: 339565723

1) read_data_cfg(datacfg);

2) option_find_int(options, "classes", 20);

3) option_find_str(options, "names", "data/names.list");

4) get_labels(name_list);

5) demo(cfg, weights, thresh, hier_thresh, cam_index, filename, names, classes, avgframes, frame_skip, prefix, out_filename, mjpeg_port, dontdraw_bbox, json_port, dont_show, ext_output, letter_box, time_limit_sec, http_post_host, benchmark, benchmark_layers);

duration_main_run_detector: 339565785

run_detector(argc, argv);

1) find_arg, find_int_arg, find_char_arg, find_float_arg

2) test_detector

3) train_detector

4) validate_detector, validate_detector_recall, validate_detector_map

5) calc_anchors

6) draw_object

7) demo(read_data_cfg, option_find_int, option_find_str, get_labels, free_list_contents_kvp, free_list, free(gpus))

duration_make_yolo_layer: 5110

make_yolo_layer

1) forward_yolo_layer;

activate_array, scal_add_cpu, entry_index, get_yolo_box, float_to_box_stride, compare_yolo_class, box_iou, delta_yolo_box, delta_yolo_box()_accumulate, averages_yolo_deltas, compute classification loss

2) backward_yolo_layer; backward_yolo_layer_gpu;

axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);

axpy_ongpu(l.batch*l.inputs, state.net.loss_scale * l.delta_normalizer, l.delta_gpu, 1, state.delta, 1);

3) cuda_make_array

4) cudaHostAlloc

duration_parse_network_cfg_custom: 2007

parse_network_cfg_custom(char *filename, int batch, int time_steps)

1) read_cfg(filename);

2) make_network(sections->size - 1);

3) parse_net_options(options, &net);

4) pre_allocate_pinned_memory((size_t)1024 * 1024 * 1024 * 8); // pre-allocate 8 GB CPU-RAM for pinned memory

5) parse_convolutional(options, params);

parse_local(options, params);

parse_activation(options, params);

parse_rnn(options, params);

parse_gru(options, params);

parse_lstm(options, params);

parse_conv_lstm(options, params);

parse_history(options, params);

parse_crnn(options, params);

parse_connected(options, params);

parse_crop(options, params);

parse_cost(options, params);

parse_region(options, params);

parse_yolo(options, params);

parse_gaussian_yolo(options, params);

parse_detection(options, params);

parse_softmax(options, params);

parse_contrastive(options, params);

parse_normalization(options, params);

parse_batchnorm(options, params);

parse_maxpool(options, params);

parse_local_avgpool(options, params);

parse_reorg(options, params);

parse_reorg_old(options, params);

parse_avgpool(options, params);

parse_route(options, params);

parse_upsample(options, params, net);

parse_shortcut(options, params, net);

parse_scale_channels(options, params, net);

parse_sam(options, params, net);

parse_dropout(options, params);

cuda_make_array_pinned

cuda_make_array_pinned_preallocated

set_specified_workspace_limit

cuda_make_array:cuda_pull_array_async,activate_array_ongpu

get_network_output

CHECK_CUDA

Coco依賴(lài)的軟件:coco.data,yolov4.cfg,yolov4.conv.137,trainvalueno5k.txt,train2014

read_data_cfg

option_find_str

open_valid_file

cuda_set_device

parse_network_cfg_custom

get_labels_custom

basecfg

parse_network_cfg

get_paths(train_images)

list_to_array(plist)

get_current_iteration(net)

draw_train_chart

load_data

rand_scale(rand_coef);

pthread_join

float_to_box

float_to_image

compute_loss

train_network_waitkey

train_networks

free_data

resize_network

validate_detector_map

save_weights

draw_train_loss

sync_nets

release_mat(&img);

destroy_all_windows_cv();

// free memory

pthread_join(load_thread, 0);

free_data(buffer);

free_load_threads(&args);

free(base); free(paths);

free_list_contents(plist);

free_list(plist);

free_list_contents_kvp(options);

free_list(options);

free_network;

free(nets);

free_network(net_map);

Makefile

GPU=0

CUDNN=0

CUDNN_HALF=0

OPENCV=0

AVX=0

OPENMP=0

LIBSO=0

ZED_CAMERA=0

ZED_CAMERA_v2_8=0

# set GPU=1 and CUDNN=1 to speedup on GPU
# set CUDNN_HALF=1 to further speedup 3 x times (Mixed-precision on Tensor Cores) GPU: Volta, Xavier, Turing and higher
# set AVX=1 and OPENMP=1 to speedup on CPU (if error occurs then set AVX=0)
# set ZED_CAMERA=1 to enable ZED SDK 3.0 and above
# set ZED_CAMERA_v2_8=1 to enable ZED SDK 2.X

USE_CPP=0

DEBUG=0

ARCH= -gencode arch=compute_30,code=sm_30 \
      -gencode arch=compute_35,code=sm_35 \
      -gencode arch=compute_50,code=[sm_50,compute_50] \
      -gencode arch=compute_52,code=[sm_52,compute_52] \
      -gencode arch=compute_61,code=[sm_61,compute_61]

OS := $(shell uname)

# Tesla A100 (GA100), DGX-A100, RTX 3080
# ARCH= -gencode arch=compute_80,code=[sm_80,compute_80]

# Tesla V100
# ARCH= -gencode arch=compute_70,code=[sm_70,compute_70]

# GeForce RTX 2080 Ti, RTX 2080, RTX 2070, Quadro RTX 8000, Quadro RTX 6000, Quadro RTX 5000, Tesla T4, XNOR Tensor Cores
# ARCH= -gencode arch=compute_75,code=[sm_75,compute_75]

# Jetson XAVIER
# ARCH= -gencode arch=compute_72,code=[sm_72,compute_72]

# GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4
# ARCH= -gencode arch=compute_61,code=sm_61 -gencode arch=compute_61,code=compute_61

# GP100/Tesla P100 - DGX-1
# ARCH= -gencode arch=compute_60,code=sm_60

# For Jetson TX1, Tegra X1, DRIVE CX, DRIVE PX - uncomment:
# ARCH= -gencode arch=compute_53,code=[sm_53,compute_53]

# For Jetson Tx2 or Drive-PX2 uncomment:
# ARCH= -gencode arch=compute_62,code=[sm_62,compute_62]

VPATH=./src/

EXEC=darknet

OBJDIR=./obj/

ifeq ($(LIBSO), 1)

LIBNAMESO=libdarknet.so

APPNAMESO=uselib

endif

ifeq ($(USE_CPP), 1)

CC=g++

else

CC=gcc

endif

CPP=g++ -std=c++11

NVCC=nvcc

OPTS=-Ofast

LDFLAGS= -lm -pthread

COMMON= -Iinclude/ -I3rdparty/stb/include

CFLAGS=-Wall -Wfatal-errors -Wno-unused-result -Wno-unknown-pragmas -fPIC

ifeq ($(DEBUG), 1)

#OPTS= -O0 -g

#OPTS= -Og -g

COMMON+= -DDEBUG

CFLAGS+= -DDEBUG

else

ifeq ($(AVX), 1)

CFLAGS+= -ffp-contract=fast -mavx -mavx2 -msse3 -msse4.1 -msse4.2 -msse4a

endif

endif

CFLAGS+=$(OPTS)

ifneq (,$(findstring MSYS_NT,$(OS)))

LDFLAGS+=-lws2_32

endif

ifeq ($(OPENCV), 1)

COMMON+= -DOPENCV

CFLAGS+= -DOPENCV

LDFLAGS+= `pkg-config --libs opencv4 2> /dev/null || pkg-config --libs opencv`

COMMON+= `pkg-config --cflags opencv4 2> /dev/null || pkg-config --cflags opencv`

endif

ifeq ($(OPENMP), 1)

ifeq ($(OS),Darwin) #MAC
CFLAGS+= -Xpreprocessor -fopenmp
else
CFLAGS+= -fopenmp
endif

LDFLAGS+= -lgomp

endif

ifeq ($(GPU), 1)

COMMON+= -DGPU -I/usr/local/cuda/include/

CFLAGS+= -DGPU

ifeq ($(OS),Darwin) #MAC

LDFLAGS+= -L/usr/local/cuda/lib -lcuda -lcudart -lcublas -lcurand

else

LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand

endif

endif

ifeq ($(CUDNN), 1)

COMMON+= -DCUDNN

ifeq ($(OS),Darwin) #MAC

CFLAGS+= -DCUDNN -I/usr/local/cuda/include

LDFLAGS+= -L/usr/local/cuda/lib -lcudnn

else

CFLAGS+= -DCUDNN -I/usr/local/cudnn/include

LDFLAGS+= -L/usr/local/cudnn/lib64 -lcudnn

endif

endif

ifeq ($(CUDNN_HALF), 1)

COMMON+= -DCUDNN_HALF

CFLAGS+= -DCUDNN_HALF

ARCH+= -gencode arch=compute_70,code=[sm_70,compute_70]

endif

ifeq ($(ZED_CAMERA), 1)

CFLAGS+= -DZED_STEREO -I/usr/local/zed/include

ifeq ($(ZED_CAMERA_v2_8), 1)

LDFLAGS+= -L/usr/local/zed/lib -lsl_core -lsl_input -lsl_zed

#-lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0

else

LDFLAGS+= -L/usr/local/zed/lib -lsl_zed

#-lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0

endif

endif

OBJ=image_opencv.o http_stream.o gemm.o utils.o
dark_cuda.o convolutional_layer.o list.o image.o activations.o im2col.o
col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o
data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o
darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o
normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o
compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o
rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o
batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o
voxel.o tree.o yolo_layer.o gaussian_yolo_layer.o upsample_layer.o lstm_layer.o
conv_lstm_layer.o scale_channels_layer.o sam_layer.o

ifeq ($(GPU), 1)

LDFLAGS+= -lstdc++

OBJ+=convolutional_kernels.o activation_kernels.o
im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o
dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o
avgpool_layer_kernels.o

endif

OBJS = $(addprefix $(OBJDIR), $(OBJ))

DEPS = $(wildcard src/*.h) Makefile include/darknet.h

all: $(OBJDIR) backup results setchmod $(EXEC) $(LIBNAMESO) $(APPNAMESO)

ifeq ($(LIBSO), 1)
CFLAGS+= -fPIC

$(LIBNAMESO): $(OBJDIR) $(OBJS) include/yolo_v2_class.hpp src/yolo_v2_class.cpp
	$(CPP) -shared -std=c++11 -fvisibility=hidden -DLIB_EXPORTS $(COMMON) $(CFLAGS) $(OBJS) src/yolo_v2_class.cpp -o $@ $(LDFLAGS)

$(APPNAMESO): $(LIBNAMESO) include/yolo_v2_class.hpp src/yolo_console_dll.cpp
	$(CPP) -std=c++11 $(COMMON) $(CFLAGS) -o $@ src/yolo_console_dll.cpp $(LDFLAGS) -L ./ -l:$(LIBNAMESO)
endif

$(EXEC): $(OBJS)
	$(CPP) -std=c++11 $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS)

$(OBJDIR)%.o: %.c $(DEPS)
	$(CC) $(COMMON) $(CFLAGS) -c $< -o $@

$(OBJDIR)%.o: %.cpp $(DEPS)
	$(CPP) -std=c++11 $(COMMON) $(CFLAGS) -c $< -o $@

$(OBJDIR)%.o: %.cu $(DEPS)
	$(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@

$(OBJDIR):
	mkdir -p $(OBJDIR)

backup:
	mkdir -p backup

results:
	mkdir -p results

setchmod:
	chmod +x *.sh

.PHONY: clean

clean:
	rm -rf $(OBJS) $(EXEC) $(LIBNAMESO) $(APPNAMESO)

SPP結(jié)構(gòu)
