Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extend letterbox behavior #2899

Merged
merged 4 commits into from
Dec 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 11 additions & 23 deletions dlib/image_transforms/interpolation.h
Original file line number Diff line number Diff line change
Expand Up @@ -977,48 +977,36 @@ namespace dlib
point_transform_affine letterbox_image (
const image_type1& img_in,
image_type2& img_out,
long size,
const interpolation_type& interp
)
{
DLIB_CASSERT(size > 0, "size must be bigger than zero, but was " << size);
const_image_view<image_type1> vimg_in(img_in);
image_view<image_type2> vimg_out(img_out);

const auto scale = size / std::max<double>(vimg_in.nr(), vimg_in.nc());
const long rows = vimg_out.nr();
const long cols = vimg_out.nc();
DLIB_CASSERT(vimg_out.size() > 0, "img_out size must be bigger than zero, but was " << rows << "x" << cols);

// early return if the image already has the requested size and no padding is needed
if (scale == 1 && vimg_in.nr() == vimg_in.nc())
if (have_same_dimensions(vimg_in, vimg_out))
{
assign_image(vimg_out, vimg_in);
return point_transform_affine();
}

vimg_out.set_size(size, size);
const double rows_scale = rows / static_cast<double>(vimg_in.nr());
const double cols_scale = cols / static_cast<double>(vimg_in.nc());
const double scale = rows_scale * vimg_in.nc() > rows ? cols_scale : rows_scale;

const long nr = std::round(scale * vimg_in.nr());
const long nc = std::round(scale * vimg_in.nc());
dpoint offset((size - nc) / 2.0, (size - nr) / 2.0);
const long nr = std::lround(scale * vimg_in.nr());
const long nc = std::lround(scale * vimg_in.nc());
const dpoint offset((cols - nc) / 2.0, (rows - nr) / 2.0);
const auto r = rectangle(offset.x(), offset.y(), offset.x() + nc - 1, offset.y() + nr - 1);
zero_border_pixels(vimg_out, r);
auto si = sub_image(img_out, r);
resize_image(vimg_in, si, interp);
return point_transform_affine(identity_matrix<double>(2) * scale, offset);
}

template <
typename image_type1,
typename image_type2
>
point_transform_affine letterbox_image (
const image_type1& img_in,
image_type2& img_out,
long size
)
{
return letterbox_image(img_in, img_out, size, interpolate_bilinear());
}

template <
typename image_type1,
typename image_type2
Expand All @@ -1028,7 +1016,7 @@ namespace dlib
image_type2& img_out
)
{
return letterbox_image(img_in, img_out, std::max(num_rows(img_in), num_columns(img_in)), interpolate_bilinear());
return letterbox_image(img_in, img_out, interpolate_bilinear());
}

// ----------------------------------------------------------------------------------------
Expand Down
45 changes: 5 additions & 40 deletions dlib/image_transforms/interpolation_abstract.h
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,6 @@ namespace dlib
point_transform_affine letterbox_image (
const image_type1& img_in,
image_type2& img_out,
long size
const interpolation_type interp
);
/*!
Expand All @@ -455,51 +454,19 @@ namespace dlib
dlib/image_processing/generic_image.h
- image_type2 == an image object that implements the interface defined in
dlib/image_processing/generic_image.h
- img_out.size() > 0
- interpolation_type == interpolate_nearest_neighbor, interpolate_bilinear,
interpolate_quadratic, or a type with a compatible interface.
- size > 0
- is_same_object(img_in, img_out) == false
ensures
- Scales in_img so that it fits into a size * size square.
In particular, we will have:
- #img_out.nr() == size
- #img_out.nc() == size
- Scales img_in so that it fits into img_out.
- Preserves the aspect ratio of img_in by 0-padding the shortest side.
- Uses the supplied interpolation routine interp to perform the necessary
pixel interpolation.
- Returns a transformation object that maps points in img_in into their
corresponding location in #img_out.
!*/

template <
typename image_type1,
typename image_type2
>
point_transform_affine letterbox_image (
const image_type1& img_in,
image_type2& img_out,
long size
);
/*!
requires
- image_type1 == an image object that implements the interface defined in
dlib/image_processing/generic_image.h
- image_type2 == an image object that implements the interface defined in
dlib/image_processing/generic_image.h
- size > 0
- is_same_object(in_img, out_img) == false
ensures
- Scales in_img so that it fits into a size * size square.
In particular, we will have:
- #img_out.nr() == size
- #img_out.nc() == size
- Preserves the aspect ratio of in_img by 0-padding the shortest side.
- Uses the bilinear interpolation to perform the necessary pixel
interpolation.
- Returns a transformation object that maps points in in_img into their
corresponding location in #out_img.
!*/

template <
typename image_type1,
typename image_type2
Expand All @@ -514,13 +481,11 @@ namespace dlib
dlib/image_processing/generic_image.h
- image_type2 == an image object that implements the interface defined in
dlib/image_processing/generic_image.h
- img_out.size() > 0
- is_same_object(img_in, img_out) == false
ensures
- 0-pads in_img so that it fits into a square whose side is computed as
max(num_rows(in_img), num_columns(in_img)) and stores into #out_img.
In particular, we will have:
- #img_out.nr() == max(num_rows(in_img), num_columns(in_img))
- #img_out.nc() == max(num_rows(in_img), num_columns(in_img))
- Scales img_in so that it fits into img_out using bilinear interpolation.
- Preserves the aspect ratio of img_in by 0-padding the shortest side.
- Returns a transformation object that maps points in img_in into their
corresponding location in #img_out.
!*/
Expand Down
4 changes: 2 additions & 2 deletions dlib/test/image.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2438,9 +2438,9 @@ namespace
rgb_pixel black(0, 0, 0);
rgb_pixel white(255, 255, 255);
matrix<rgb_pixel> img_s(40, 60);
matrix<rgb_pixel> img_d;
matrix<rgb_pixel> img_d(30, 30);
assign_all_pixels(img_s, white);
const auto tform = letterbox_image(img_s, img_d, 30, interpolate_nearest_neighbor());
const auto tform = letterbox_image(img_s, img_d, interpolate_nearest_neighbor());
DLIB_TEST(tform.get_m() == identity_matrix<double>(2) * 0.5);
DLIB_TEST(tform.get_b() == dpoint(0, 5));

Expand Down
2 changes: 1 addition & 1 deletion docs/docs/imaging.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2008,7 +2008,7 @@
<file>dlib/image_transforms.h</file>
<spec_file link="true">dlib/image_transforms/interpolation_abstract.h</spec_file>
<description>
Scales an image so that it fits into a size * size square, while preserving the aspect
Scales an image so that it fits into the dimensions of the output image, while preserving the aspect
ratio of the actual contents by appropriate 0 padding.

<examples>
Expand Down
15 changes: 8 additions & 7 deletions examples/dnn_yolo_train_ex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,9 @@ namespace darknet
}

// In this example, YOLO expects square images, and we choose to transform them by letterboxing them.
rectangle_transform preprocess_image(const matrix<rgb_pixel>& image, matrix<rgb_pixel>& output, const long image_size)
rectangle_transform preprocess_image(const matrix<rgb_pixel>& image, matrix<rgb_pixel>& output)
{
return rectangle_transform(inv(letterbox_image(image, output, image_size)));
return rectangle_transform(inv(letterbox_image(image, output)));
}

// YOLO outputs the bounding boxes in the coordinate system of the input (letterboxed) image, so we need to convert them
Expand Down Expand Up @@ -296,14 +296,14 @@ try
}
const double threshold = get_option(parser, "test", 0.01);
image_window win;
matrix<rgb_pixel> image, resized;
matrix<rgb_pixel> image, resized(image_size, image_size);
for (const auto& im : dataset.images)
{
win.clear_overlay();
load_image(image, data_directory + "/" + im.filename);
win.set_title(im.filename);
win.set_image(image);
const auto tform = preprocess_image(image, resized, image_size);
const auto tform = preprocess_image(image, resized);
auto detections = net.process(resized, threshold);
postprocess_detections(tform, detections);
cout << "# detections: " << detections.size() << endl;
Expand All @@ -329,7 +329,7 @@ try
cout << "Could not find file " << sync_file_name << endl;
return EXIT_FAILURE;
}
matrix<rgb_pixel> image, resized;
matrix<rgb_pixel> image, resized(image_size, image_size);
std::map<std::string, std::vector<std::pair<double, bool>>> hits;
std::map<std::string, unsigned long> missing;
for (const auto& label : options.labels)
Expand All @@ -342,7 +342,7 @@ try
{
const auto& im = dataset.images[i];
load_image(image, data_directory + "/" + im.filename);
const auto tform = preprocess_image(image, resized, image_size);
const auto tform = preprocess_image(image, resized);
auto dets = net.process(resized, 0.005);
postprocess_detections(tform, dets);
std::vector<bool> used(dets.size(), false);
Expand Down Expand Up @@ -395,6 +395,7 @@ try
dlib::rand rnd(time(nullptr) + seed);
matrix<rgb_pixel> image, rotated;
std::pair<matrix<rgb_pixel>, std::vector<yolo_rect>> temp;
temp.first.set_size(image_size, image_size);
random_cropper cropper;
cropper.set_seed(time(nullptr) + seed);
cropper.set_chip_dims(image_size, image_size);
Expand Down Expand Up @@ -423,7 +424,7 @@ try
for (auto& box : temp.second)
box.rect = tform(box.rect);

tform = letterbox_image(rotated, temp.first, image_size);
tform = letterbox_image(rotated, temp.first);
for (auto& box : temp.second)
box.rect = tform(box.rect);

Expand Down
Loading