# How to Create a Computer Vision Component

In this section we will demonstrate how to create a component that detects objects and outputs bounding boxes using an AI model.
## Steps to Create an Object Detection Component

- Component Configuration
- Parse Configuration Parameters
- Define Input Types
- Convert Input Data into `cv::Mat`
- Preprocessing
- Postprocessing
- Inference
- Define Output Types
- Convert Detections into `BoundingBox`
## Component Configuration

Each function of the component needs to be defined in the configuration:

- The component takes an `Image` and outputs a list of `BoundingBox`.
name: "Basic Detect Objects" language: cpp platform: linux/amd64 # extended to include additional CV libraries build_system: 1.1.3-extended tags: ["default", "latest"] worker: input_type: "Image" output_type: "[BoundingBox]" file_schema: model: file_type: "model" file_slot: "" config_key: "model_name" deployable: "triton" is_optional: false config_schema: # Which classes to perform detections for. If empty, detects all classes. classes: type: "[Int64]" default: [] rescale_factor: type: Double default: 1.0 # Threshold for confidence object_confidence_threshold: type: Double default: 0.4 # Non maximum suppression threshold for detections. object_nms_threshold: type: Double default: 0.6 classes_nms_equivalence: type: "[[Int64]]" default: [] object_scale_factor: type: Double default: 1.3 model_height: type: UInt64 model_width: type: UInt64 top_detections: type: UInt64 default: 100 color_model: type: "Image.RGB | Image.BGR | Image.GRAY" default: "Image.RGB ()" mean: type: "(Double, Double, Double)" std: type: "(Double, Double, Double)" input_layer: type: "String" output_label_layer: type: "String" output_detections_layer: type: "String" triton_address: type: "String" triton_port: type: "String" depends_on: - service: triton interface: inference
## Parse Configuration Parameters

- The configuration parameters from the previous step need to be parsed into C++.

How to parse the parameters:
```cpp
struct Params {
  std::string triton_address;
  std::string triton_port;
  std::string model_name;
  std::string input_name;
  std::string output_detections;
  std::string output_label;
  int model_height;
  int model_width;
  int top_detections;
  cv::Scalar mean;
  cv::Scalar std;
  cv::Scalar rescale_factor;
  ppl::detect::ObjectDetectionParameters detection_params;

  Params(const std::string &proto_filename) {
    const std::map<std::string, Object<Any>> config =
        ppl::stream::parse_config(proto_filename);
    // Model and layer names.
    model_name = std::string(
        ppl::stream::read_config(config, "model_name").as<String>());
    input_name = std::string(
        ppl::stream::read_config(config, "input_layer").as<String>());
    output_detections = std::string(
        ppl::stream::read_config(config, "output_detections_layer").as<String>());
    output_label = std::string(
        ppl::stream::read_config(config, "output_label_layer").as<String>());
    // Store the reciprocal per channel so preprocessing can divide by it.
    double rescale_val =
        ppl::stream::read_config(config, "rescale_factor").as<Double>();
    rescale_factor =
        cv::Scalar(1 / rescale_val, 1 / rescale_val, 1 / rescale_val);
    // Triton connection settings.
    triton_address = std::string(
        ppl::stream::read_config(config, "triton_address").as<String>());
    triton_port = std::string(
        ppl::stream::read_config(config, "triton_port").as<String>());
    top_detections =
        ppl::stream::read_config(config, "top_detections").as<UInt64>();
    // Each inner list is a set of class ids treated as equivalent during NMS.
    ConstReference<List<List<Int64>>> nms_eq_classes =
        ppl::stream::read_config(config, "classes_nms_equivalence")
            .as<List<List<Int64>>>();
    for (int i = 0; i < nms_eq_classes.size(); i++) {
      ConstReference<List<Int64>> eq_class = nms_eq_classes.data(i);
      std::set<int> s{};
      for (int j = 0; j < eq_class.size(); j++) {
        s.insert(eq_class.data(j));
      }
      detection_params.classes_nms_equivalence.insert(std::move(s));
    }
    detection_params.confidence_threshold =
        ppl::stream::read_config(config, "object_confidence_threshold")
            .as<Double>();
    detection_params.nms_threshold =
        ppl::stream::read_config(config, "object_nms_threshold").as<Double>();
    detection_params.scale_factor =
        ppl::stream::read_config(config, "object_scale_factor").as<Double>();
    model_height = ppl::stream::read_config(config, "model_height").as<UInt64>();
    model_width = ppl::stream::read_config(config, "model_width").as<UInt64>();
    // Per-channel normalization constants.
    ConstReference<Tuple<Double, Double, Double>> mean_setup =
        ppl::stream::read_config(config, "mean")
            .as<Tuple<Double, Double, Double>>();
    ConstReference<Tuple<Double, Double, Double>> std_setup =
        ppl::stream::read_config(config, "std")
            .as<Tuple<Double, Double, Double>>();
    mean = cv::Scalar(mean_setup.get<0>(), mean_setup.get<1>(),
                      mean_setup.get<2>());
    std = cv::Scalar(std_setup.get<0>(), std_setup.get<1>(),
                     std_setup.get<2>());
  }
};
```
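A minimal usage sketch, assuming the framework hands the component the path to its serialized configuration at startup (the `"config.pb"` path below is hypothetical):

```cpp
#include <cassert>
#include <iostream>

// Hypothetical entry point: "config.pb" stands in for whatever path the
// runtime actually provides to the component.
Params params("config.pb");

// The parsed values can now drive preprocessing and inference.
assert(params.model_height > 0 && params.model_width > 0);
std::cout << "Model '" << params.model_name << "' expects "
          << params.model_width << "x" << params.model_height
          << " input on layer '" << params.input_name << "'\n";
```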
## Define Input Types

- Define `Image` in C++ so that incoming messages can be converted into C++ variables.

C++ definition:
```cpp
using BGR = Named<"Image.BGR", Tuple<>>;
using BGRA = Named<"Image.BGRA", Tuple<>>;
using GRAY = Named<"Image.GRAY", Tuple<>>;
using RGB = Named<"Image.RGB", Tuple<>>;
using RGBA = Named<"Image.RGBA", Tuple<>>;
using Formats = ppl::pipelang::Union<BGR, BGRA, GRAY, RGB, RGBA>;
using Color_Formats = ppl::pipelang::Union<RGB, BGR, GRAY>;

using ImageData =
    Record<RecordField<"width", UInt64>, RecordField<"height", UInt64>,
           RecordField<"data", Bytes>, RecordField<"format", Formats>>;
using Image = Named<"Image", ImageData>;
```
## Convert Input Data into `cv::Mat`

- Define the conversion function that converts the input `Image` into a `cv::Mat`:
```cpp
cv::Mat image(ConstReference<Image> img) {
  int channels = 0;
  int depth = CV_8U;
  // Map the image format onto a channel count (see the type aliases above).
  auto format = img.get().get<"format">();
  if (format.holds_alternative<BGR>()) {
    channels = 3;
  } else if (format.holds_alternative<BGRA>()) {
    channels = 4;
  } else if (format.holds_alternative<GRAY>()) {
    channels = 1;
  } else if (format.holds_alternative<RGB>()) {
    channels = 3;
  } else {
    // RGBA
    channels = 4;
  }
  cv::Mat mat(static_cast<int>(img.get().get<"height">()),
              static_cast<int>(img.get().get<"width">()),
              CV_MAKETYPE(depth, channels));
  // Assumes the buffer is tightly packed: row stride == width * channels.
  memcpy(mat.data, img.get().get<"data">().data(), img.get().get<"data">().size());
  return mat;
}
```
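The `memcpy` above relies on the incoming pixel buffer being tightly packed, with no padding at the end of each row. A standalone sketch of that layout assumption using plain OpenCV:

```cpp
#include <opencv2/core.hpp>
#include <cstring>
#include <vector>

int main() {
  // A 2x2 BGR image as a tightly packed byte buffer: 2 * 2 * 3 = 12 bytes.
  std::vector<unsigned char> bytes = {
      255, 0, 0,   0, 255, 0,    // row 0: blue pixel, green pixel
      0, 0, 255,   255, 255, 255 // row 1: red pixel, white pixel
  };
  cv::Mat mat(2, 2, CV_8UC3);
  std::memcpy(mat.data, bytes.data(), bytes.size());
  // mat.at<cv::Vec3b>(1, 0) is now (0, 0, 255), i.e. red in BGR order.
  return 0;
}
```

If a producer can emit row-padded buffers, copy row by row using the source stride instead of one bulk `memcpy`.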
## Preprocessing

- Preprocessing converts the input matrix to RGB and normalizes it, then transforms the `cv::Mat` into a `ppl::infer::Tensor`. In the Pipelogic library, the inference operation only works with `ppl::infer::Tensor`.
```cpp
float scale_factor(int h, int w, int new_h, int new_w) {
  /* Returns the largest scale factor by which the image can be rescaled to
   * fit into the size (new_h, new_w). Useful for aspect-preserving resizing;
   * the simple resize below stretches the image instead. */
  return std::min(static_cast<float>(new_h) / h, static_cast<float>(new_w) / w);
}

std::vector<std::unique_ptr<ppl::infer::Tensor>>
preprocess(const cv::Mat &img, const Params &params) {
  // Resize to the model's input resolution (stretching, not letterboxing).
  cv::Mat resized_img;
  cv::resize(img, resized_img, cv::Size(params.model_width, params.model_height));
  // Convert BGR -> RGB and switch to 32-bit float.
  cv::Mat colored_img;
  cv::cvtColor(resized_img, colored_img, cv::COLOR_BGR2RGB);
  colored_img.convertTo(colored_img, CV_32FC3);
  // Per-channel normalization: (pixel / rescale) - mean, then divide by std.
  cv::Mat normalized_img;
  cv::divide(colored_img, params.rescale_factor, normalized_img);
  cv::subtract(normalized_img, params.mean, normalized_img);
  cv::divide(normalized_img, params.std, normalized_img);
  // Repack from interleaved HWC into planar CHW, as the model expects.
  int num_channels = normalized_img.channels();
  cv::Mat channels[3];
  cv::split(normalized_img, channels);
  int sizes[3] = {num_channels, params.model_height, params.model_width};
  cv::Mat input_mat(3, sizes, CV_32F, cv::Scalar::all(0));
  for (int i = 0; i < num_channels; i++) {
    for (int j = 0; j < params.model_height; j++) {
      memcpy(input_mat.ptr<float>(i, j), channels[i].ptr<float>(j),
             params.model_width * sizeof(float));
    }
  }
  // Wrap the matrix as a tensor and declare its shape to the inference layer.
  std::shared_ptr<ppl::infer::Tensor> input_tensor =
      std::make_shared<ppl::ocv::MatTensor>(input_mat);
  std::vector<std::unique_ptr<ppl::infer::Tensor>> input_values;
  input_values.push_back(std::make_unique<ppl::infer::ReshapeTensor>(
      std::vector<int64_t>{num_channels, params.model_height, params.model_width},
      input_tensor));
  return input_values;
}
```
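As a cross-check, OpenCV's `dnn` module can fold the resize, BGR-to-RGB swap, scaling, mean subtraction, and HWC-to-CHW repacking into a single call. Two differences from the pipeline above: `cv::dnn::blobFromImage` subtracts the mean *before* applying the scale factor (so the mean must be given in pixel units), and it has no per-channel std division, which is why the explicit version is still needed when `std` differs from 1. A sketch, with illustrative mean values:

```cpp
#include <opencv2/dnn.hpp>

// Produces a 1xCxHxW float32 blob: (resized - mean) * scalefactor, RGB order.
// Only equivalent to preprocess() above when params.std is (1, 1, 1).
cv::Mat blob = cv::dnn::blobFromImage(
    img,
    1.0 / 255.0,                                        // scalefactor
    cv::Size(params.model_width, params.model_height),  // model input size
    cv::Scalar(123.675, 116.28, 103.53),                // illustrative mean, pixel units
    /*swapRB=*/true,                                    // BGR -> RGB
    /*crop=*/false);
```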
## Postprocessing

- Process the model detections into `ppl::ocv::BoundingBox`. `ppl::ocv::BoundingBox` is a Pipelogic-specific bounding box that holds a `cv::Rect`, a confidence score, and an object class.
```cpp
std::vector<ppl::ocv::BoundingBox>
postprocess(const std::vector<std::unique_ptr<ppl::infer::Tensor>> &outputs,
            const cv::Mat &img, const Params &params) {
  int target_height = img.rows;
  int target_width = img.cols;
  // Factors that map model-space coordinates back to the original image.
  float scale_factor_y = static_cast<float>(params.model_height) / target_height;
  float scale_factor_x = static_cast<float>(params.model_width) / target_width;
  std::vector<ppl::ocv::BoundingBox> output_values;
  for (int i = 0; i < params.top_detections; ++i) {
    float score = outputs[0]->at<float>({i, 4});
    int label = outputs[1]->at<int64_t>({0, i});
    // Skip padded detection slots and invalid scores.
    if (score <= 0.0 || score > 1.0)
      continue;
    float x0 = outputs[0]->at<float>({i, 0});
    float y0 = outputs[0]->at<float>({i, 1});
    float x1 = outputs[0]->at<float>({i, 2});
    float y1 = outputs[0]->at<float>({i, 3});
    // Rescale to the original image and clamp to its bounds.
    int out_x0 = std::max(0, static_cast<int>(x0 / scale_factor_x));
    int out_y0 = std::max(0, static_cast<int>(y0 / scale_factor_y));
    int out_x1 = std::min(img.cols - 1, static_cast<int>(x1 / scale_factor_x));
    int out_y1 = std::min(img.rows - 1, static_cast<int>(y1 / scale_factor_y));
    cv::Point2d tl(out_x0, out_y0);
    cv::Point2d br(out_x1, out_y1);
    output_values.push_back(ppl::ocv::BoundingBox(tl, br, label, score));
  }
  return output_values;
}
```
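As a worked example of the coordinate mapping: with a 640x640 model and a 1920x1080 frame, `scale_factor_x` is 640/1920 = 1/3, so a model-space `x0` of 160 maps back to 480 in the original image.

The other configuration knob at work in this stage is `object_nms_threshold`. Assuming the detector applies standard non-maximum suppression (which the parameter names suggest), that threshold is compared against the intersection-over-union (IoU) of candidate boxes, which is cheap to express with `cv::Rect`:

```cpp
#include <opencv2/core.hpp>

// IoU of two boxes. Under standard NMS, when two boxes of the same (or
// NMS-equivalent) class overlap with IoU above object_nms_threshold, the
// lower-confidence box is suppressed.
float iou(const cv::Rect &a, const cv::Rect &b) {
  const float inter = static_cast<float>((a & b).area()); // intersection rect
  const float uni = static_cast<float>(a.area() + b.area()) - inter;
  return uni > 0.f ? inter / uni : 0.f;
}
```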
## Inference

- Define the inference model with the Triton address and port:
```cpp
std::shared_ptr<ppl::infer::Model> model =
    std::make_shared<ppl::triton::InferenceModel>(
        params.triton_address + ":" + params.triton_port,
        params.model_name);
```
- Initialize the inference detector:

```cpp
// Create the object detector for inference, wiring in the layer names and
// detection parameters parsed in the configuration step above.
auto detector = std::make_shared<ppl::detect::ObjectDetector>(
    model,
    std::vector<std::string>{params.input_name},      // model input layer names
    std::vector<std::string>{params.output_detections,
                             params.output_label},    // model output layer names
    preprocess, postprocess,
    params.detection_params);
```
## Define Output Types

- Define `BoundingBox` in C++, matching the component's `[BoundingBox]` output type:

```cpp
template <ppl::pipelang::concepts::Static T>
using Rectangle =
    Named<"Rectangle", Record<RecordField<"top_left", Point<T>>,
                              RecordField<"bottom_right", Point<T>>>>;

using DetectedClass =
    Named<"DetectedClass",
          Record<RecordField<"id", UInt64>, RecordField<"confidence", Double>>>;

using BoundingBox =
    Named<"BoundingBox", Record<RecordField<"class", DetectedClass>,
                                RecordField<"rectangle", Rectangle<Double>>>>;
```
## Convert Detections into `BoundingBox`

- Run inference and convert each detection into a `BoundingBox`:

```cpp
std::function<ppl::stream::Message(ppl::stream::Message)> function =
    [detector, &params](ppl::stream::Message input_image) {
      cv::Mat img = image(input_image.as<Image>());
      // Run inference and find detections.
      std::vector<ppl::ocv::BoundingBox> bbs = detector->find_objects(img);
      // Convert each detection into the BoundingBox output type.
      Object<List<BoundingBox>> output;
      for (const auto &bbox : bbs) {
        Object<BoundingBox> out_bbox{std::map<std::string, Object<Any>>{
            {"class", bbox.class_()}, {"rectangle", bbox.rectangle()}}};
        output.push_back(out_bbox);
      }
      return output;
    };
```
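Putting the pieces together, a minimal composition sketch. The `"config.pb"` path is hypothetical, and registering `function` with the streaming runtime is platform-specific, so it is omitted here:

```cpp
int main() {
  // Parse configuration (hypothetical path; the runtime supplies the real one).
  Params params("config.pb");

  // Connect to Triton and wrap the model.
  std::shared_ptr<ppl::infer::Model> model =
      std::make_shared<ppl::triton::InferenceModel>(
          params.triton_address + ":" + params.triton_port, params.model_name);

  // Build the detector from the pre/postprocessing functions defined above.
  auto detector = std::make_shared<ppl::detect::ObjectDetector>(
      model, std::vector<std::string>{params.input_name},
      std::vector<std::string>{params.output_detections, params.output_label},
      preprocess, postprocess, params.detection_params);

  // `function` (defined above) now maps each incoming Image message to a
  // list of BoundingBox messages; hand it to the runtime to serve the stream.
  return 0;
}
```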