# How to Create a Computer Vision Component

In this section we will demonstrate how to create a component that detects objects and outputs bounding boxes using an AI model.
## Steps to Create an Object Detection Component

- Component Configuration
- Parse Configuration Parameters
- Define Input Types
- Convert Input Data into `cv::Mat`
- Preprocessing
- Postprocessing
- Inference
- Define Output Types
- Convert Detections into `BoundingBox`
## Component Configuration

Each function of the component needs to be defined in the configuration:

- The component takes an `Image` and outputs a list of `BoundingBox`.
name: "Basic Detect Objects" language: cpp platform: linux/amd64 # extended to include additional CV libraries build_system: 1.1.3-extended tags: ["default", "latest"] worker: input_type: "Image" output_type: "[BoundingBox]" file_schema: model: file_type: "model" file_slot: "" config_key: "model_name" deployable: "triton" is_optional: false config_schema: # Which classes to perform detections for. If empty, detects all classes. classes: type: "[Int64]" default: [] rescale_factor: type: Double default: 1.0 # Threshold for confidence object_confidence_threshold: type: Double default: 0.4 # Non maximum suppression threshold for detections. object_nms_threshold: type: Double default: 0.6 classes_nms_equivalence: type: "[[Int64]]" default: [] object_scale_factor: type: Double default: 1.3 model_height: type: UInt64 model_width: type: UInt64 top_detections: type: UInt64 default: 100 color_model: type: "Image.RGB | Image.BGR | Image.GRAY" default: "Image.RGB ()" mean: type: "(Double, Double, Double)" std: type: "(Double, Double, Double)" input_layer: type: "String" output_label_layer: type: "String" output_detections_layer: type: "String" triton_address: type: "String" triton_port: type: "String" depends_on: - service: triton interface: inference
## Parse Configuration Parameters

- The configuration parameters from the previous step need to be parsed into C++.

How to parse the parameters:
```cpp
struct Params {
  std::string triton_address;
  std::string triton_port;
  std::string model_name;
  std::string input_name;
  std::string output_detections;
  std::string output_label;
  int model_height;
  int model_width;
  int top_detections;
  cv::Scalar mean;
  cv::Scalar std;
  cv::Scalar rescale_factor;
  ppl::detect::ObjectDetectionParameters detection_params;

  Params(const std::string &proto_filename) {
    const std::map<std::string, Object<Any>> config =
        ppl::stream::parse_config(proto_filename);
    // Model and layer names.
    model_name = std::string(
        ppl::stream::read_config(config, "model_name").as<String>());
    input_name = std::string(
        ppl::stream::read_config(config, "input_layer").as<String>());
    output_detections = std::string(
        ppl::stream::read_config(config, "output_detections_layer").as<String>());
    output_label = std::string(
        ppl::stream::read_config(config, "output_label_layer").as<String>());
    // Store the reciprocal per channel so preprocessing can divide by it.
    double rescale_val =
        ppl::stream::read_config(config, "rescale_factor").as<Double>();
    rescale_factor =
        cv::Scalar(1 / rescale_val, 1 / rescale_val, 1 / rescale_val);
    // Triton connection settings.
    triton_address = std::string(
        ppl::stream::read_config(config, "triton_address").as<String>());
    triton_port = std::string(
        ppl::stream::read_config(config, "triton_port").as<String>());
    top_detections =
        ppl::stream::read_config(config, "top_detections").as<UInt64>();
    // Each inner list is a set of class ids treated as equivalent during NMS.
    ConstReference<List<List<Int64>>> nms_eq_classes =
        ppl::stream::read_config(config, "classes_nms_equivalence")
            .as<List<List<Int64>>>();
    for (int i = 0; i < nms_eq_classes.size(); i++) {
      ConstReference<List<Int64>> eq_class = nms_eq_classes.data(i);
      std::set<int> s{};
      for (int j = 0; j < eq_class.size(); j++) {
        s.insert(eq_class.data(j));
      }
      detection_params.classes_nms_equivalence.insert(std::move(s));
    }
    detection_params.confidence_threshold =
        ppl::stream::read_config(config, "object_confidence_threshold")
            .as<Double>();
    detection_params.nms_threshold =
        ppl::stream::read_config(config, "object_nms_threshold").as<Double>();
    detection_params.scale_factor =
        ppl::stream::read_config(config, "object_scale_factor").as<Double>();
    model_height = ppl::stream::read_config(config, "model_height").as<UInt64>();
    model_width = ppl::stream::read_config(config, "model_width").as<UInt64>();
    // Per-channel normalization constants.
    ConstReference<Tuple<Double, Double, Double>> mean_setup =
        ppl::stream::read_config(config, "mean")
            .as<Tuple<Double, Double, Double>>();
    ConstReference<Tuple<Double, Double, Double>> std_setup =
        ppl::stream::read_config(config, "std")
            .as<Tuple<Double, Double, Double>>();
    mean = cv::Scalar(mean_setup.get<0>(), mean_setup.get<1>(),
                      mean_setup.get<2>());
    std = cv::Scalar(std_setup.get<0>(), std_setup.get<1>(),
                     std_setup.get<2>());
  }
};
```
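A minimal usage sketch, assuming the framework hands the component the path to its serialized configuration at startup (the `"config.pb"` path below is hypothetical):

```cpp
#include <cassert>
#include <iostream>

// Hypothetical entry point: "config.pb" stands in for whatever path the
// runtime actually provides to the component.
Params params("config.pb");

// The parsed values can now drive preprocessing and inference.
assert(params.model_height > 0 && params.model_width > 0);
std::cout << "Model '" << params.model_name << "' expects "
          << params.model_width << "x" << params.model_height
          << " input on layer '" << params.input_name << "'\n";
```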
## Define Input Types

- Define `Image` in C++ so that incoming messages can be converted into C++ variables.

C++ definition:
```cpp
using BGR = Named<"Image.BGR", Tuple<>>;
using BGRA = Named<"Image.BGRA", Tuple<>>;
using GRAY = Named<"Image.GRAY", Tuple<>>;
using RGB = Named<"Image.RGB", Tuple<>>;
using RGBA = Named<"Image.RGBA", Tuple<>>;
using Formats = ppl::pipelang::Union<BGR, BGRA, GRAY, RGB, RGBA>;
using Color_Formats = ppl::pipelang::Union<RGB, BGR, GRAY>;

using ImageData =
    Record<RecordField<"width", UInt64>, RecordField<"height", UInt64>,
           RecordField<"data", Bytes>, RecordField<"format", Formats>>;
using Image = Named<"Image", ImageData>;
```
## Convert Input Data into `cv::Mat`

- Define the conversion function that converts the input `Image` into a `cv::Mat`:
```cpp
cv::Mat image(ConstReference<Image> img) {
  int channels = 0;
  int depth = CV_8U;
  // Map the image format onto a channel count (see the type aliases above).
  auto format = img.get().get<"format">();
  if (format.holds_alternative<BGR>()) {
    channels = 3;
  } else if (format.holds_alternative<BGRA>()) {
    channels = 4;
  } else if (format.holds_alternative<GRAY>()) {
    channels = 1;
  } else if (format.holds_alternative<RGB>()) {
    channels = 3;
  } else {
    // RGBA
    channels = 4;
  }
  cv::Mat mat(static_cast<int>(img.get().get<"height">()),
              static_cast<int>(img.get().get<"width">()),
              CV_MAKETYPE(depth, channels));
  // Assumes the buffer is tightly packed: row stride == width * channels.
  memcpy(mat.data, img.get().get<"data">().data(), img.get().get<"data">().size());
  return mat;
}
```
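The `memcpy` above relies on the incoming pixel buffer being tightly packed, with no padding at the end of each row. A standalone sketch of that layout assumption using plain OpenCV:

```cpp
#include <opencv2/core.hpp>
#include <cstring>
#include <vector>

int main() {
  // A 2x2 BGR image as a tightly packed byte buffer: 2 * 2 * 3 = 12 bytes.
  std::vector<unsigned char> bytes = {
      255, 0, 0,   0, 255, 0,    // row 0: blue pixel, green pixel
      0, 0, 255,   255, 255, 255 // row 1: red pixel, white pixel
  };
  cv::Mat mat(2, 2, CV_8UC3);
  std::memcpy(mat.data, bytes.data(), bytes.size());
  // mat.at<cv::Vec3b>(1, 0) is now (0, 0, 255), i.e. red in BGR order.
  return 0;
}
```

If a producer can emit row-padded buffers, copy row by row using the source stride instead of one bulk `memcpy`.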
## Preprocessing

- Preprocessing converts the input matrix to RGB and normalizes it, then transforms the `cv::Mat` into a `ppl::infer::Tensor`. In the Pipelogic library, the inference operation only works with `ppl::infer::Tensor`.
```cpp
float scale_factor(int h, int w, int new_h, int new_w) {
  /* Returns the largest scale factor by which the image can be rescaled to
   * fit into the size (new_h, new_w). Useful for aspect-preserving resizing;
   * the simple resize below stretches the image instead. */
  return std::min(static_cast<float>(new_h) / h, static_cast<float>(new_w) / w);
}

std::vector<std::unique_ptr<ppl::infer::Tensor>>
preprocess(const cv::Mat &img, const Params &params) {
  // Resize to the model's input resolution (stretching, not letterboxing).
  cv::Mat resized_img;
  cv::resize(img, resized_img, cv::Size(params.model_width, params.model_height));
  // Convert BGR -> RGB and switch to 32-bit float.
  cv::Mat colored_img;
  cv::cvtColor(resized_img, colored_img, cv::COLOR_BGR2RGB);
  colored_img.convertTo(colored_img, CV_32FC3);
  // Per-channel normalization: (pixel / rescale) - mean, then divide by std.
  cv::Mat normalized_img;
  cv::divide(colored_img, params.rescale_factor, normalized_img);
  cv::subtract(normalized_img, params.mean, normalized_img);
  cv::divide(normalized_img, params.std, normalized_img);
  // Repack from interleaved HWC into planar CHW, as the model expects.
  int num_channels = normalized_img.channels();
  cv::Mat channels[3];
  cv::split(normalized_img, channels);
  int sizes[3] = {num_channels, params.model_height, params.model_width};
  cv::Mat input_mat(3, sizes, CV_32F, cv::Scalar::all(0));
  for (int i = 0; i < num_channels; i++) {
    for (int j = 0; j < params.model_height; j++) {
      memcpy(input_mat.ptr<float>(i, j), channels[i].ptr<float>(j),
             params.model_width * sizeof(float));
    }
  }
  // Wrap the matrix as a tensor and declare its shape to the inference layer.
  std::shared_ptr<ppl::infer::Tensor> input_tensor =
      std::make_shared<ppl::ocv::MatTensor>(input_mat);
  std::vector<std::unique_ptr<ppl::infer::Tensor>> input_values;
  input_values.push_back(std::make_unique<ppl::infer::ReshapeTensor>(
      std::vector<int64_t>{num_channels, params.model_height, params.model_width},
      input_tensor));
  return input_values;
}
```
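As a cross-check, OpenCV's `dnn` module can fold the resize, BGR-to-RGB swap, scaling, mean subtraction, and HWC-to-CHW repacking into a single call. Two differences from the pipeline above: `cv::dnn::blobFromImage` subtracts the mean *before* applying the scale factor (so the mean must be given in pixel units), and it has no per-channel std division, which is why the explicit version is still needed when `std` differs from 1. A sketch, with illustrative mean values:

```cpp
#include <opencv2/dnn.hpp>

// Produces a 1xCxHxW float32 blob: (resized - mean) * scalefactor, RGB order.
// Only equivalent to preprocess() above when params.std is (1, 1, 1).
cv::Mat blob = cv::dnn::blobFromImage(
    img,
    1.0 / 255.0,                                        // scalefactor
    cv::Size(params.model_width, params.model_height),  // model input size
    cv::Scalar(123.675, 116.28, 103.53),                // illustrative mean, pixel units
    /*swapRB=*/true,                                    // BGR -> RGB
    /*crop=*/false);
```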
## Postprocessing

- Process the model detections into `ppl::ocv::BoundingBox`. `ppl::ocv::BoundingBox` is a Pipelogic-specific bounding box that holds a `cv::Rect`, a confidence score, and an object class.
```cpp
std::vector<ppl::ocv::BoundingBox>
postprocess(const std::vector<std::unique_ptr<ppl::infer::Tensor>> &outputs,
            const cv::Mat &img, const Params &params) {
  int target_height = img.rows;
  int target_width = img.cols;
  // Factors that map model-space coordinates back to the original image.
  float scale_factor_y = static_cast<float>(params.model_height) / target_height;
  float scale_factor_x = static_cast<float>(params.model_width) / target_width;
  std::vector<ppl::ocv::BoundingBox> output_values;
  for (int i = 0; i < params.top_detections; ++i) {
    float score = outputs[0]->at<float>({i, 4});
    int label = outputs[1]->at<int64_t>({0, i});
    // Skip padded detection slots and invalid scores.
    if (score <= 0.0 || score > 1.0)
      continue;
    float x0 = outputs[0]->at<float>({i, 0});
    float y0 = outputs[0]->at<float>({i, 1});
    float x1 = outputs[0]->at<float>({i, 2});
    float y1 = outputs[0]->at<float>({i, 3});
    // Rescale to the original image and clamp to its bounds.
    int out_x0 = std::max(0, static_cast<int>(x0 / scale_factor_x));
    int out_y0 = std::max(0, static_cast<int>(y0 / scale_factor_y));
    int out_x1 = std::min(img.cols - 1, static_cast<int>(x1 / scale_factor_x));
    int out_y1 = std::min(img.rows - 1, static_cast<int>(y1 / scale_factor_y));
    cv::Point2d tl(out_x0, out_y0);
    cv::Point2d br(out_x1, out_y1);
    output_values.push_back(ppl::ocv::BoundingBox(tl, br, label, score));
  }
  return output_values;
}
```
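As a worked example of the coordinate mapping: with a 640x640 model and a 1920x1080 frame, `scale_factor_x` is 640/1920 = 1/3, so a model-space `x0` of 160 maps back to 480 in the original image.

The other configuration knob at work in this stage is `object_nms_threshold`. Assuming the detector applies standard non-maximum suppression (which the parameter names suggest), that threshold is compared against the intersection-over-union (IoU) of candidate boxes, which is cheap to express with `cv::Rect`:

```cpp
#include <opencv2/core.hpp>

// IoU of two boxes. Under standard NMS, when two boxes of the same (or
// NMS-equivalent) class overlap with IoU above object_nms_threshold, the
// lower-confidence box is suppressed.
float iou(const cv::Rect &a, const cv::Rect &b) {
  const float inter = static_cast<float>((a & b).area()); // intersection rect
  const float uni = static_cast<float>(a.area() + b.area()) - inter;
  return uni > 0.f ? inter / uni : 0.f;
}
```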
## Inference

- Define the inference model with the Triton address and port:
```cpp
std::shared_ptr<ppl::infer::Model> model =
    std::make_shared<ppl::triton::InferenceModel>(
        params.triton_address + ":" + params.triton_port,
        params.model_name);
```
- Initialize the inference detector:

```cpp
// Create the object detector for inference, wiring in the layer names and
// detection parameters parsed in the configuration step above.
auto detector = std::make_shared<ppl::detect::ObjectDetector>(
    model,
    std::vector<std::string>{params.input_name},      // model input layer names
    std::vector<std::string>{params.output_detections,
                             params.output_label},    // model output layer names
    preprocess, postprocess,
    params.detection_params);
```
## Define Output Types

- Define `BoundingBox` in C++, matching the component's `[BoundingBox]` output type:

```cpp
template <ppl::pipelang::concepts::Static T>
using Rectangle =
    Named<"Rectangle", Record<RecordField<"top_left", Point<T>>,
                              RecordField<"bottom_right", Point<T>>>>;

using DetectedClass =
    Named<"DetectedClass",
          Record<RecordField<"id", UInt64>, RecordField<"confidence", Double>>>;

using BoundingBox =
    Named<"BoundingBox", Record<RecordField<"class", DetectedClass>,
                                RecordField<"rectangle", Rectangle<Double>>>>;
```
## Convert Detections into `BoundingBox`

- Run inference and convert each detection into a `BoundingBox`:

```cpp
std::function<ppl::stream::Message(ppl::stream::Message)> function =
    [detector, &params](ppl::stream::Message input_image) {
      cv::Mat img = image(input_image.as<Image>());
      // Run inference and find detections.
      std::vector<ppl::ocv::BoundingBox> bbs = detector->find_objects(img);
      // Convert each detection into the BoundingBox output type.
      Object<List<BoundingBox>> output;
      for (const auto &bbox : bbs) {
        Object<BoundingBox> out_bbox{std::map<std::string, Object<Any>>{
            {"class", bbox.class_()}, {"rectangle", bbox.rectangle()}}};
        output.push_back(out_bbox);
      }
      return output;
    };
```
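Putting the pieces together, a minimal composition sketch. The `"config.pb"` path is hypothetical, and registering `function` with the streaming runtime is platform-specific, so it is omitted here:

```cpp
int main() {
  // Parse configuration (hypothetical path; the runtime supplies the real one).
  Params params("config.pb");

  // Connect to Triton and wrap the model.
  std::shared_ptr<ppl::infer::Model> model =
      std::make_shared<ppl::triton::InferenceModel>(
          params.triton_address + ":" + params.triton_port, params.model_name);

  // Build the detector from the pre/postprocessing functions defined above.
  auto detector = std::make_shared<ppl::detect::ObjectDetector>(
      model, std::vector<std::string>{params.input_name},
      std::vector<std::string>{params.output_detections, params.output_label},
      preprocess, postprocess, params.detection_params);

  // `function` (defined above) now maps each incoming Image message to a
  // list of BoundingBox messages; hand it to the runtime to serve the stream.
  return 0;
}
```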