C++多线程rknn_ssd例程

[复制链接] · 发表于 2019-4-18 16:14:34

本帖最后由 momo 于 2019-4-23 14:01 编辑

本例程修改自rk官方提供的rknn_ssd.cpp，处理的数据流来自usb-camera，帧率在25fps
20190416：添加本地视频读取功能，640x480的视频流，帧率可达50fps
               支持线程绑定CPU，两个大核用于NPU深度学习处理

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <dirent.h>
#include <pthread.h>
#include <sched.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>

#include <array>
#include <atomic>
#include <chrono>
#include <cmath>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <queue>
#include <sstream>
#include <string>
#include <thread>
#include <utility>
#include <vector>

#include "rknn_api.h"
#include "opencv2/core/core.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
#include <opencv2/opencv.hpp>

#include <unistd.h>
#include <sys/syscall.h>
// Returns the kernel thread id of the calling thread.
//
// glibc 2.30 and later already declare gettid() in <unistd.h>; defining it
// again there fails to compile, so the local syscall wrapper is only provided
// for older C libraries.
#if !(defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 30)))
inline pid_t gettid()
{
  return static_cast<pid_t>(syscall(__NR_gettid));
}
#endif

using namespace std;
using namespace cv;
using namespace std::chrono;

// Number of candidate boxes per frame; sizes the prior/output arrays and
// bounds the decode and score loops.
#define NUM_RESULTS       1917
// Number of class scores per candidate box (index 0 is skipped as the
// catch-all class when scoring).
#define NUM_CLASSES       91

// Divisors applied to the raw box predictions in decodeCenterSizeBoxes():
// Y/X scale the centre offsets, H/W scale the log-size terms.
#define Y_SCALE  10.0f
#define X_SCALE  10.0f
#define H_SCALE  5.0f
#define W_SCALE  5.0f

// __AVE_TIC__(tag) / __AVE_TOC__(tag): running-average timer.
// TIC declares static accumulators (total microseconds and sample count) plus
// start/end timevals named after `tag` in the enclosing scope and records the
// start time. Both macros must be used in the same scope with the same tag.
#define __AVE_TIC__(tag) static int ____##tag##_total_time=0; \
      static int ____##tag##_total_conut=0;\
      timeval ____##tag##_start_time, ____##tag##_end_time;\
      gettimeofday(&____##tag##_start_time, 0);

// TOC records the end time, adds the elapsed microseconds to the accumulator,
// bumps the sample count and prints the average so far to stderr.
#define __AVE_TOC__(tag) gettimeofday(&____##tag##_end_time, 0); \
      ____##tag##_total_conut++; \
      ____##tag##_total_time+=((int)____##tag##_end_time.tv_sec-(int)____##tag##_start_time.tv_sec)*1000000+((int)____##tag##_end_time.tv_usec-(int)____##tag##_start_time.tv_usec); \
      fprintf(stderr,  #tag ": %d us\n", ____##tag##_total_time/____##tag##_total_conut);

// __TIC__(tag) / __TOC__(tag): one-shot timer.
// TIC declares the start/end timevals for `tag` and records the start time.
#define __TIC__(tag) timeval ____##tag##_start_time, ____##tag##_end_time;\
      gettimeofday(&____##tag##_start_time, 0);

// TOC records the end time, declares a local holding the elapsed microseconds
// and prints it to stderr. Because it declares a variable, it can be expanded
// only once per tag in a given scope.
#define __TOC__(tag) gettimeofday(&____##tag##_end_time, 0); \
      int ____##tag##_total_time=((int)____##tag##_end_time.tv_sec-(int)____##tag##_start_time.tv_sec)*1000000+((int)____##tag##_end_time.tv_usec-(int)____##tag##_start_time.tv_usec); \
      fprintf(stderr,  #tag ": %d us\n", ____##tag##_total_time);

// Pipeline state shared between the reader, NPU worker and display threads.

// Index given to the next frame the reader thread pushes onto the input queue.
int idxInputImage = 0;
// Index of the next frame the display thread is waiting to show.
int idxShowImage = 0;
// True while input is still being produced. The NPU workers poll it to decide
// whether to keep waiting on an empty input queue; it is written from a
// different thread than the ones reading it, hence atomic.
std::atomic<bool> bReading{true};
// Moment the reader thread started capturing; the display thread uses it to
// compute the FPS overlay.
std::chrono::system_clock::time_point start_time;

// A frame paired with its capture index.
typedef pair<int, Mat> imagePair;

// Comparison functor for the display priority queue: reports whether n1 has
// the larger index, so the queue's top() is the frame with the smallest index.
class paircomp {
public:
  bool operator()(const imagePair &n1, const imagePair &n2) const {
    // Equal indices are simply "not greater"; no separate case is needed.
    return n2.first < n1.first;
  }
};

mutex mtxQueueInput;             // guards queueInput
mutex mtxQueueShow;             // guards queueShow
queue<pair<int, Mat>> queueInput;  // captured frames waiting for an NPU worker, in capture order
priority_queue<imagePair, vector<imagePair>, paircomp>
      queueShow;  // processed frames waiting for display, ordered by paircomp

// _T(func): statement wrapper for optional timing.
// With SHOWTIME defined it runs `func`, measures the elapsed wall-clock time
// in microseconds and prints "[TimeTest]<name> <duration>us", where <name> is
// the text of `func` up to its first '('. Without SHOWTIME it expands to the
// bare statement.
#ifdef SHOWTIME
#define _T(func)                                                             \
{                                                                      \
      auto _start = system_clock::now();                                  \
      func;                                                                \
      auto _end = system_clock::now();                                     \
      auto duration = (duration_cast<microseconds>(_end - _start)).count(); \
      string tmp = #func;                                                 \
      tmp = tmp.substr(0, tmp.find('('));                                  \
      cout << "[TimeTest]" << left << setw(30) << tmp;                   \
      cout << left << setw(10) << duration << "us" << endl;                \
}
#else
#define _T(func) func;
#endif

// Box colours; run_process() picks one with (class index % 10).
Scalar colorArray[10] = {
      Scalar(139, 0, 0, 255),
      Scalar(139, 0, 139, 255),
      Scalar(  0, 0, 139, 255),
      Scalar(  0, 100, 0, 255),
      Scalar(139, 139, 0, 255),
      Scalar(209, 206, 0, 255),
      Scalar(  0, 127, 255, 255),
      Scalar(139,  61,  72, 255),
      Scalar(  0, 255, 0, 255),
      Scalar(255, 0, 0, 255),
};

// Minimum top-class score for a candidate box to be kept by scaleToInputSize().
float MIN_SCORE = 0.4f;

// IoU at or above which nms() suppresses the later of two boxes.
float NMS_THRESHOLD = 0.45f;

// One flag per NPU worker; run_process() sets its slot to 1 once its model is
// initialised, and the reader threads wait until every slot is non-zero.
int multi_npu_process_initialized[2] = {0, 0};

int loadLabelName(string locationFilename, string* labels) {
ifstream fin(locationFilename);
string line;
int lineNum = 0;
while(getline(fin, line))
{
      labels[lineNum] = line;
      lineNum++;
}
return 0;
}

// Loads the SSD box priors from a text file into `boxPriors`.
//
// Each line of the file is one row of NUM_RESULTS numbers separated by commas
// and/or spaces. `boxPriors` must have room for 4 rows.
// Returns 0 on success. Returns -1 if the file cannot be opened, if a line
// does not hold exactly NUM_RESULTS values, or if the file has more than
// 4 lines; rows parsed before the failure are left filled in.
//
// The line is tokenised by hand rather than with strtok(): strtok() keeps
// hidden global state and is called from both NPU threads, and it would also
// have to write into the std::string's buffer.
int loadCoderOptions(string locationFilename, float (*boxPriors)[NUM_RESULTS])
{
  const int kPriorRows = 4;
  const char *delims = ", ";

  if (boxPriors == nullptr) {
    return -1;
  }

  ifstream fin(locationFilename);
  if (!fin.is_open()) {
    return -1;
  }

  string line;
  int lineNum = 0;
  while (getline(fin, line)) {
    if (lineNum >= kPriorRows) {
      return -1;  // more rows than the destination can hold
    }

    int priorIndex = 0;
    string::size_type pos = line.find_first_not_of(delims);
    while (pos != string::npos) {
      if (priorIndex >= NUM_RESULTS) {
        return -1;  // more values than the row can hold
      }
      const string::size_type end = line.find_first_of(delims, pos);
      const string token =
          line.substr(pos, end == string::npos ? string::npos : end - pos);
      boxPriors[lineNum][priorIndex++] = static_cast<float>(atof(token.c_str()));
      pos = (end == string::npos) ? string::npos
                                  : line.find_first_not_of(delims, end);
    }

    if (priorIndex != NUM_RESULTS) {
      return -1;
    }
    ++lineNum;
  }
  return 0;
}

// Intersection-over-union of two axis-aligned boxes given as
// (xmin, ymin, xmax, ymax). Returns 0 when the union area is not positive.
float CalculateOverlap(float xmin0, float ymin0, float xmax0, float ymax0, float xmin1, float ymin1, float xmax1, float ymax1) {
  const float overlapW = std::max(0.f, std::min(xmax0, xmax1) - std::max(xmin0, xmin1));
  const float overlapH = std::max(0.f, std::min(ymax0, ymax1) - std::max(ymin0, ymin1));
  const float intersection = overlapW * overlapH;

  const float area0 = (xmax0 - xmin0) * (ymax0 - ymin0);
  const float area1 = (xmax1 - xmin1) * (ymax1 - ymin1);
  const float unionArea = area0 + area1 - intersection;

  if (unionArea <= 0.f) {
    return 0.f;
  }
  return intersection / unionArea;
}

// Logistic sigmoid: 1 / (1 + e^-x).
float expit(float x) {
  const double denominator = 1.0 + exp(-x);
  return static_cast<float>(1.0 / denominator);
}

// Decodes the raw box regressions in `predictions` in place.
//
// `predictions` holds NUM_RESULTS groups of 4 floats (y, x, h, w offsets).
// `boxPriors` rows 0..3 supply, for box i, the prior's centre y, centre x,
// height and width. On return each group holds (ymin, xmin, ymax, xmax).
void decodeCenterSizeBoxes(float* predictions, float (*boxPriors)[NUM_RESULTS]) {
  for (int i = 0; i < NUM_RESULTS; ++i) {
    float* box = predictions + i * 4;

    const float priorY = boxPriors[0][i];
    const float priorX = boxPriors[1][i];
    const float priorH = boxPriors[2][i];
    const float priorW = boxPriors[3][i];

    // Centre offsets are relative to the prior size; sizes are predicted in
    // log space.
    const float ycenter = box[0] / Y_SCALE * priorH + priorY;
    const float xcenter = box[1] / X_SCALE * priorW + priorX;
    const float h = static_cast<float>(exp(box[2] / H_SCALE)) * priorH;
    const float w = static_cast<float>(exp(box[3] / W_SCALE)) * priorW;

    box[0] = ycenter - h / 2.0f;  // ymin
    box[1] = xcenter - w / 2.0f;  // xmin
    box[2] = ycenter + h / 2.0f;  // ymax
    box[3] = xcenter + w / 2.0f;  // xmax
  }
}

// Picks the best-scoring class for each of the NUM_RESULTS candidate boxes and
// records the boxes whose best score reaches MIN_SCORE.
//
// `outputClasses` holds `numClasses` raw scores per box; class 0 is skipped.
// For the k-th kept box, output[0][k] receives the box index and output[1][k]
// the winning class index. Returns the number of kept boxes.
int scaleToInputSize(float * outputClasses, int (*output)[NUM_RESULTS], int numClasses)
{
  int kept = 0;

  for (int box = 0; box < NUM_RESULTS; ++box) {
    const float* rawScores = outputClasses + box * numClasses;

    float bestScore = -1000.0f;
    int bestClass = -1;

    // Class 0 is the catch-all class and never competes.
    for (int cls = 1; cls < numClasses; ++cls) {
      const float score = expit(rawScores[cls]);
      if (score > bestScore) {
        bestScore = score;
        bestClass = cls;
      }
    }

    if (!(bestScore >= MIN_SCORE)) {
      continue;
    }

    output[0][kept] = box;
    output[1][kept] = bestClass;
    ++kept;
  }

  return kept;
}

// Non-maximum suppression over the first `validCount` entries of output[0].
//
// output[0][k] is a box index into `outputLocations` (4 floats per box, laid
// out as ymin, xmin, ymax, xmax) or -1 if already suppressed. For every
// surviving box, each later box whose IoU with it is at least NMS_THRESHOLD
// is suppressed by setting its entry to -1. Always returns 0.
int nms(int validCount, float* outputLocations, int (*output)[NUM_RESULTS])
{
  for (int i = 0; i < validCount; ++i) {
    const int n = output[0][i];
    if (n == -1) {
      continue;
    }

    // The reference box does not change while scanning the later boxes.
    const float ymin0 = outputLocations[n * 4 + 0];
    const float xmin0 = outputLocations[n * 4 + 1];
    const float ymax0 = outputLocations[n * 4 + 2];
    const float xmax0 = outputLocations[n * 4 + 3];

    for (int j = i + 1; j < validCount; ++j) {
      const int m = output[0][j];
      if (m == -1) {
        continue;
      }

      const float ymin1 = outputLocations[m * 4 + 0];
      const float xmin1 = outputLocations[m * 4 + 1];
      const float ymax1 = outputLocations[m * 4 + 2];
      const float xmax1 = outputLocations[m * 4 + 3];

      const float iou = CalculateOverlap(xmin0, ymin0, xmax0, ymax0,
                                         xmin1, ymin1, xmax1, ymax1);
      if (iou >= NMS_THRESHOLD) {
        output[0][j] = -1;
      }
    }
  }

  return 0;
}

void cameraRead(int index)
{
      int i = 0;
  int initialization_finished = 1;
  cpu_set_t mask;
  int cpuid = 2;

  CPU_ZERO(&mask);
  CPU_SET(cpuid, &mask);

  if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) < 0)
cerr << "set thread affinity failed" << endl;

  printf("Bind CameraCapture process to CPU %d\n", cpuid);

      VideoCapture camera(index);
      if (!camera.isOpened()) {
            cerr << "Open camera error!" << endl;
            exit(-1);
      }

  camera.set(3, 640);
  camera.set(4, 480);

  while (true) {
initialization_finished = 1;

for (int i = 0; i < sizeof(multi_npu_process_initialized) / sizeof(int); i++) {
   //cout << i << " " << multi_npu_process_initialized << endl;
   if (multi_npu_process_initialized == 0) {
      initialization_finished = 0;
      //break;
   }
}

if (initialization_finished)
   break;

sleep(1);
  }

  start_time = chrono::system_clock::now();
      while (true) {
            // read function
            usleep(20000);
            Mat img;
            camera >> img;
            if (img.empty()) {
                     cerr << "Fail to read image from camera!" << endl;
                     break;
            }

            mtxQueueInput.lock();
            queueInput.push(make_pair(idxInputImage++, img));
            if (queueInput.size() >= 30) {
                     mtxQueueInput.unlock();
                     cout << "[Warning]input queue size is " << queueInput.size() << endl;
                     sleep(1);
            } else {
                     mtxQueueInput.unlock();
            }
  }
}

void videoRead(const char *video_name)
{
      int i = 0;
  int initialization_finished = 1;
  int cpuid = 2;
  cpu_set_t mask;

  CPU_ZERO(&mask);
  CPU_SET(cpuid, &mask);

  if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) < 0)
cerr << "set thread affinity failed" << endl;

  printf("Bind VideoCapture process to CPU %d\n", cpuid);

      VideoCapture video;

      if (!video.open(video_name)) {
            cout << "Fail to open " << video_name << endl;
            return;
      }

  int frame_cnt = video.get(CV_CAP_PROP_FRAME_COUNT);

  while (true) {
initialization_finished = 1;

for (int i = 0; i < sizeof(multi_npu_process_initialized) / sizeof(int); i++) {
   //cout << i << " " << multi_npu_process_initialized << endl;
   if (multi_npu_process_initialized == 0) {
      initialization_finished = 0;
      //break;
   }
}

if (initialization_finished)
   break;

sleep(1);
  }

  start_time = chrono::system_clock::now();
      while (true)
  {
      usleep(1000);
Mat img;

            if (queueInput.size() < 30) {
   if (!video.read(img)) {
      cout << "read video stream failed!" << endl;
      return;
   }
   mtxQueueInput.lock();
   queueInput.push(make_pair(idxInputImage++, img));
                     mtxQueueInput.unlock();

   if (idxInputImage >= frame_cnt)
      break;
            } else {
                     usleep(10);
            }
  }
}

void displayImage() {
  Mat img;
  cpu_set_t mask;
  int cpuid = 3;

  CPU_ZERO(&mask);
  CPU_SET(cpuid, &mask);

  if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) < 0)
cerr << "set thread affinity failed" << endl;

  printf("Bind Display process to CPU %d\n", cpuid);

while (true) {
      mtxQueueShow.lock();
      if (queueShow.empty()) {
         mtxQueueShow.unlock();
         usleep(10);
      } else if (idxShowImage == queueShow.top().first) {
         auto show_time = chrono::system_clock::now();
         stringstream buffer;
         Mat img = queueShow.top().second;
         auto dura = (duration_cast<microseconds>(show_time - start_time)).count();
         buffer << fixed << setprecision(2)
               << (float)queueShow.top().first / (dura / 1000000.f);
         string a = buffer.str() + "FPS";
         cv::putText(img, a, cv:oint(15, 15), 1, 1, cv::Scalar{0, 0, 255},2);
         cv::imshow("RK3399Pro", img);  // display image
         idxShowImage++;
         queueShow.pop();
         mtxQueueShow.unlock();
         if (waitKey(1) == 'q') {
            bReading = false;
            exit(0);
         }
      } else {
         mtxQueueShow.unlock();
      }
}
}

void run_process(int thread_id)
{
  const char *model_path = "/tmp/mobilenet_ssd.rknn";
  const char *label_path = "/tmp/coco_labels_list.txt";
  const char *box_priors_path = "/tmp/box_priors.txt";
  const int img_width = 300;
  const int img_height = 300;
  const int img_channels = 3;
  const int input_index = 0;    // node name "reprocessor/sub"

  const int output_elems1 = NUM_RESULTS * 4;
  const uint32_t output_size1 = output_elems1 * sizeof(float);
  const int output_index1 = 0; // node name "concat"

  const int output_elems2 = NUM_RESULTS * NUM_CLASSES;
  const uint32_t output_size2 = output_elems2 * sizeof(float);
  const int output_index2 = 1; // node name "concat_1"

  cv::Mat resimg;

  cpu_set_t mask;
  int cpuid = 0;

  if (thread_id == 0)
cpuid = 4;
  else if (thread_id == 1)
cpuid = 5;
  else
cpuid = 0;

  CPU_ZERO(&mask);
  CPU_SET(cpuid, &mask);

  if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) < 0)
cerr << "set thread affinity failed" << endl;

  printf("Bind NPU process(%d) to CPU %d\n", thread_id, cpuid);

  FILE *fp = fopen(model_path, "rb");
  if(fp == NULL) {
printf("fopen %s fail!\n", model_path);
return;
  }
  fseek(fp, 0, SEEK_END);

  int model_len = ftell(fp);
  void *model = malloc(model_len);
  fseek(fp, 0, SEEK_SET);
  if(model_len != fread(model, 1, model_len, fp)) {
printf("fread %s fail!\n", model_path);
free(model);
return;
  }

  // Start Inference
  rknn_input inputs[1];
  rknn_output outputs[2];
  rknn_tensor_attr outputs_attr[2];

  int ret = 0;
  rknn_context ctx = 0;

  ret = rknn_init(&ctx, model, model_len, RKNN_FLAG_PRIOR_MEDIUM);
  if(ret < 0) {
printf("rknn_init fail! ret=%d\n", ret);
return;
  }

  outputs_attr[0].index = 0;
  ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(outputs_attr[0]), sizeof(outputs_attr[0]));
  if(ret < 0) {
printf("rknn_query fail! ret=%d\n", ret);
return;
  }

  outputs_attr[1].index = 1;
  ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(outputs_attr[1]), sizeof(outputs_attr[1]));
  if(ret < 0) {
printf("rknn_query fail! ret=%d\n", ret);
return;
  }

  if (thread_id > sizeof(multi_npu_process_initialized) / sizeof(int) - 1)
return;

  multi_npu_process_initialized[thread_id] = 1;
  cout << "The initialization of NPU Process " << thread_id << " has finished." << endl;

  while (true) {
pair<int, Mat> pairIndexImage;
mtxQueueInput.lock();
if (queueInput.empty()) {
   mtxQueueInput.unlock();
   if (bReading)
      continue;
   else
      break;
   } else {
      // Get an image from input queue
      pairIndexImage = queueInput.front();
      queueInput.pop();
      mtxQueueInput.unlock();
   }

   cv::resize(pairIndexImage.second, resimg, cv::Size(img_width, img_height), (0, 0), (0, 0), cv::INTER_LINEAR);

   inputs[0].index = input_index;
   inputs[0].buf = resimg.data;
   inputs[0].size = img_width * img_height * img_channels;
   inputs[0].pass_through = false;
   inputs[0].type = RKNN_TENSOR_UINT8;
   inputs[0].fmt = RKNN_TENSOR_NHWC;
   ret = rknn_inputs_set(ctx, 1, inputs);
   if(ret < 0) {
      printf("rknn_input_set fail! ret=%d\n", ret);
      return;
   }

   ret = rknn_run(ctx, nullptr);
   if(ret < 0) {
      printf("rknn_run fail! ret=%d\n", ret);
      return;
   }

   outputs[0].want_float = true;
   outputs[0].is_prealloc = false;
   outputs[1].want_float = true;
   outputs[1].is_prealloc = false;
   ret = rknn_outputs_get(ctx, 2, outputs, nullptr);
   if(ret < 0) {
      printf("rknn_outputs_get fail! ret=%d\n", ret);
      return;
   }
   if(outputs[0].size == outputs_attr[0].n_elems*sizeof(float) && outputs[1].size == outputs_attr[1].n_elems*sizeof(float))
   {
      float boxPriors[4][NUM_RESULTS];
      string labels[91];

      /* load label and boxPriors */
      loadLabelName(label_path, labels);
      loadCoderOptions(box_priors_path, boxPriors);

      float* predictions = (float*)outputs[0].buf;
      float* outputClasses = (float*)outputs[1].buf;

      int output[2][NUM_RESULTS];

      /* transform */
      decodeCenterSizeBoxes(predictions, boxPriors);

      int validCount = scaleToInputSize(outputClasses, output, NUM_CLASSES);
      //printf("validCount: %d\n", validCount);

      if (validCount < 100) {
            /* detect nest box */
            nms(validCount, predictions, output);

            /* box valid detect target */
            for (int i = 0; i < validCount; ++i) {
               if (output[0] == -1) {
                  continue;
               }
               int n = output[0];
               int topClassScoreIndex = output[1];

               int x1 = static_cast<int>(predictions[n * 4 + 1] * pairIndexImage.second.cols);
               int y1 = static_cast<int>(predictions[n * 4 + 0] * pairIndexImage.second.rows);
               int x2 = static_cast<int>(predictions[n * 4 + 3] * pairIndexImage.second.cols);
               int y2 = static_cast<int>(predictions[n * 4 + 2] * pairIndexImage.second.rows);

               string label = labels[topClassScoreIndex];

               //std::cout << label << "\t@ (" << x1 << ", " << y1 << ") (" << x2 << ", " << y2 << ")" << "\n";

               rectangle(pairIndexImage.second, Point(x1, y1), Point(x2, y2), colorArray[topClassScoreIndex%10], 2);
               putText(pairIndexImage.second, label, Point(x1, y1 - 12), 1, 1, Scalar(0, 255, 0, 255));
            }
      } else {
            printf("validCount too much!\n");
      }
   }
   else
   {
      printf("rknn_outputs_get fail! get outputs_size = [%d, %d], but expect [%lu, %lu]!\n",
            outputs[0].size, outputs[1].size, outputs_attr[0].n_elems*sizeof(float), outputs_attr[1].n_elems*sizeof(float));
   }
   rknn_outputs_release(ctx, 2, outputs);

   mtxQueueShow.lock();
   // Put the processed iamge to show queue
   queueShow.push(pairIndexImage);
   mtxQueueShow.unlock();
  }
}

// Entry point.
//
// Usage: ./rknn_ssd c <camera_index>   read frames from a camera
//        ./rknn_ssd v <video_name>     read frames from a video file
//
// Starts four threads (one reader, one display thread and two NPU workers)
// and waits for all of them. Returns -1 on a usage error, 0 otherwise.
int main(const int argc, const char** argv)
{
  const auto print_usage = []() {
    cout << "Usage of this exec: ./rknn_ssd c camera_index" << endl;
    cout << "                   ./rknn_ssd v video_name" << endl;
  };

  if (argc != 3) {
    print_usage();
    return -1;
  }

  array<thread, 4> threads;

  // Validate the mode before starting anything, so a bad argument never
  // leaves threads running.
  const string mode = argv[1];
  if (mode == "c") {
    const int camera_index = atoi(argv[2]);
    threads[0] = thread(cameraRead, camera_index);
  } else if (mode == "v") {
    threads[0] = thread(videoRead, argv[2]);
  } else {
    print_usage();
    return -1;
  }

  threads[1] = thread(displayImage);
  threads[2] = thread(run_process, 0);
  threads[3] = thread(run_process, 1);

  const long cpus = sysconf(_SC_NPROCESSORS_CONF);
  cout << "This system has " << cpus << " processor(s)" << endl;

  for (auto &worker : threads)
    worker.join();

  return 0;
}

只看该作者 · 发表于 2019-4-18 16:37:26

rknn-api自带的opencv库不支持imshow，因此需要手动在线安装opencv库，才能观看实时检测效果

只看该作者 · 发表于 2019-4-18 16:40:17

赞一个。用openmp写起来应该效果更好，最好还可以设置一下CPU的亲和性，把费时的操作做到大核上，不费时的用小核去做

只看该作者 · 发表于 2019-4-18 16:44:26

q5671229 发表于 2019-4-18 16:40
赞一个。用openmp写起来应该效果更好，最好还可以设置一下CPU的亲和性，把费时的操作做到大核上，不费时的 ...

这个建议不错，后续尝试一下

只看该作者 · 发表于 2019-4-19 14:41:38

理论上帧率还能更高，25帧猜测是usb camera输出有限制；

只看该作者 · 发表于 2019-4-19 15:10:16

leok 发表于 2019-4-19 14:41
理论上帧率还能更高，25帧猜测是usb camera输出有限制；

是的，测试本地640x480的avi视频，帧率可达50fps

只看该作者 · 发表于 2019-4-19 15:29:42

非常不错，感谢感谢！

只看该作者 · 发表于 2019-4-27 08:37:28

q5671229 发表于 2019-4-18 16:40
赞一个。用openmp写起来应该效果更好，最好还可以设置一下CPU的亲和性，把费时的操作做到大核上，不费时的 ...

openmp似乎并不是很理想，我在查找资料的时候看到这么一篇：
http://www.icxbk.com/article/detail/222.html

只看该作者 · 发表于 2019-6-28 18:05:17

momo 发表于 2019-4-18 16:37
rknn-api自带的opencv库不支持imshow，因此需要手动在线安装opencv库，才能观看实时检测效果 ...

麻烦请问一下，该怎么手动在线安装opencv库啊，是yum install opencv opencv-dev吗，我安装完了，但还是不知道该怎么调用

只看该作者 · 发表于 2019-7-12 11:17:10

vanilla 发表于 2019-6-28 18:05
麻烦请问一下，该怎么手动在线安装opencv库啊，是yum install opencv opencv-dev吗，我安装完了，但还是 ...

下载opencv源码，然后编译安装

C++多线程rknn_ssd例程

本帖子中包含更多资源