1 #include <caffe2/video/video_io.h> 2 #include <caffe2/core/logging.h> 13 const int crop_height,
// NOTE(review): this excerpt carries embedded source line numbers and skips
// lines; the function header, the declaration of `alpha` and the
// declaration/advance of `p` are not visible here.
// Presumably this is the Saturation() helper called by the jitter dispatcher
// further down -- confirm against the full file.
//
// Visible behavior: each pixel of a 3-plane clip is blended in place with its
// own weighted gray value: out = in * alpha + gray * (1 - alpha).
15 const float alpha_rand,
16 std::mt19937* randgen) {
// One sample from U(-alpha_rand, alpha_rand) drawn from *randgen; how it is
// turned into `alpha` is on a line missing from this excerpt.
18 std::uniform_real_distribution<float>(-alpha_rand, alpha_rand)(*randgen);
// Number of elements in one channel plane; planes sit channel_size apart.
21 const int channel_size = length * crop_height * crop_width;
23 for (
int l = 0; l < length; ++l) {
24 for (
int h = 0; h < crop_height; ++h) {
25 for (
int w = 0; w < crop_width; ++w) {
// Gray value of the pixel at offset p: planes 0/1/2 weighted by
// 0.299 / 0.587 / 0.114 (presumably R, G, B order -- TODO confirm).
26 float gray_color = clip[p] * 0.299f + clip[p + channel_size] * 0.587f +
27 clip[p + 2 * channel_size] * 0.114f;
// Blend the same pixel in each of the three planes toward that gray value.
28 for (
int c = 0; c < 3; ++c) {
29 clip[c * channel_size + p] =
30 clip[c * channel_size + p] * alpha + gray_color * (1.0f - alpha);
// NOTE(review): header and innermost statement of this function are missing
// from the excerpt. Presumably this is the Brightness() helper called by the
// jitter dispatcher -- confirm against the full file.
42 const int crop_height,
44 const float alpha_rand,
45 std::mt19937* randgen) {
// One sample from U(-alpha_rand, alpha_rand) drawn from *randgen; its use is
// on lines that are not visible here.
47 std::uniform_real_distribution<float>(-alpha_rand, alpha_rand)(*randgen);
// Visits every element of 3 channel planes in (c, l, h, w) order.
50 for (
int c = 0; c < 3; ++c) {
51 for (
int l = 0; l < length; ++l) {
52 for (
int h = 0; h < crop_height; ++h) {
53 for (
int w = 0; w < crop_width; ++w) {
// NOTE(review): header, the declarations of `gray_mean`, `alpha` and `p`, and
// the closing braces are missing from this excerpt. Presumably this is the
// Contrast() helper called by the jitter dispatcher -- confirm.
//
// Visible behavior: computes the mean gray value of the whole clip, then
// blends every element toward it: out = in * alpha + gray_mean * (1 - alpha).
65 const int crop_height,
67 const float alpha_rand,
68 std::mt19937* randgen) {
// Number of elements in one channel plane; planes sit channel_size apart.
69 const int channel_size = length * crop_height * crop_width;
// Pass 1: accumulate the weighted gray value of every pixel.
72 for (
int l = 0; l < length; ++l) {
73 for (
int h = 0; h < crop_height; ++h) {
74 for (
int w = 0; w < crop_width; ++w) {
// Planes 0/1/2 weighted by 0.299 / 0.587 / 0.114 (presumably R, G, B).
76 gray_mean += clip[p] * 0.299f + clip[p + channel_size] * 0.587f +
77 clip[p + 2 * channel_size] * 0.114f;
// Turn the sum into a mean.
// NOTE(review): the divisor is zero when any dimension is zero -- confirm
// callers never pass an empty clip.
82 gray_mean /= (length * crop_height * crop_width);
// One sample from U(-alpha_rand, alpha_rand); the assignment to `alpha` is on
// a line that is not visible here.
85 std::uniform_real_distribution<float>(-alpha_rand, alpha_rand)(*randgen);
// Pass 2: blend every element of the 3 planes toward the clip-wide gray mean.
87 for (
int c = 0; c < 3; ++c) {
88 for (
int l = 0; l < length; ++l) {
89 for (
int h = 0; h < crop_height; ++h) {
90 for (
int w = 0; w < crop_width; ++w) {
91 clip[p] = clip[p] * alpha + gray_mean * (1.0f - alpha);
// NOTE(review): the function header is missing from this excerpt (presumably
// the color-jitter entry point -- confirm the name against the full file).
//
// Visible behavior: applies Saturation, Brightness and Contrast to `clip`,
// one after another, in an order taken from `jitter_order`.
103 const int crop_height,
104 const int crop_width,
105 const float saturation,
106 const float brightness,
107 const float contrast,
108 std::mt19937* randgen) {
// Reseeds the C library generator from the wall clock on every call.
109 std::srand(
unsigned(std::time(0)));
// 0 = saturation, 1 = brightness, anything else = contrast (see dispatch).
110 std::vector<int> jitter_order{0, 1, 2};
// Seed taken from the system clock, not from `randgen`.
// NOTE(review): the operation order is therefore not reproducible from the
// caller's generator; the call consuming these arguments is on a missing line
// (presumably std::shuffle) -- confirm whether this is intended.
112 unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
114 jitter_order.begin(),
116 std::default_random_engine(seed));
// Run the three adjustments in the chosen order; each draws its own strength
// from *randgen.
118 for (
int i = 0; i < 3; ++i) {
119 if (jitter_order[i] == 0) {
120 Saturation(clip, length, crop_height, crop_width, saturation, randgen);
121 }
else if (jitter_order[i] == 1) {
122 Brightness(clip, length, crop_height, crop_width, brightness, randgen);
// Remaining case (the `else` line itself is missing from this excerpt).
124 Contrast(clip, length, crop_height, crop_width, contrast, randgen);
// NOTE(review): the function header and the declaration of `p` are missing
// from this excerpt (presumably the color-lighting routine -- confirm name).
//
// Visible behavior: draws three Gaussian coefficients, combines them with the
// supplied eigenvectors/eigenvalues into one offset per channel, and adds that
// offset to every element of the corresponding channel plane.
133 const int crop_height,
134 const int crop_width,
135 const float alpha_std,
136 const std::vector<std::vector<float>>& eigvecs,
137 const std::vector<float>& eigvals,
138 std::mt19937* randgen) {
// alphas[i] ~ N(0, alpha_std), drawn from *randgen.
139 std::normal_distribution<float> d(0, alpha_std);
140 std::vector<float> alphas(3);
141 for (
int i = 0; i < 3; ++i) {
142 alphas[i] = d(*randgen);
// delta_rgb[i] = sum_j eigvecs[i][j] * eigvals[j] * alphas[j].
// Indexes eigvecs as 3x3 and eigvals as length 3; sizes are not checked here.
145 std::vector<float> delta_rgb(3, 0.0);
146 for (
int i = 0; i < 3; ++i) {
147 for (
int j = 0; j < 3; ++j) {
148 delta_rgb[i] += eigvecs[i][j] * eigvals[j] * alphas[j];
// Walk the clip linearly in (c, l, h, w) order, shifting each channel plane
// by its offset.
153 for (
int c = 0; c < 3; ++c) {
154 for (
int l = 0; l < length; ++l) {
155 for (
int h = 0; h < crop_height; ++h) {
156 for (
int w = 0; w < crop_width; ++w) {
157 clip[p++] += delta_rgb[c];
// Normalizes a planar float clip in place: every element of channel `c`
// becomes (value - mean[c]) * inv_std[c].
//
// The clip is addressed as `channels` consecutive planes, each holding
// length * crop_height * crop_width floats, visited in (c, l, h, w) order.
//
// Preconditions (not checked here): `clip` holds at least
// channels * length * crop_height * crop_width floats, and `mean` / `inv_std`
// hold at least `channels` entries each.
// Non-positive dimensions leave the clip untouched.
void ColorNormalization(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const int channels,
    const std::vector<float>& mean,
    const std::vector<float>& inv_std) {
  if (length <= 0 || crop_height <= 0 || crop_width <= 0) {
    return;
  }

  // Computed in size_t so the running offset cannot overflow a 32-bit int on
  // large clips.
  const std::size_t plane_size =
      static_cast<std::size_t>(length) * crop_height * crop_width;

  float* plane = clip;
  for (int c = 0; c < channels; ++c, plane += plane_size) {
    // Per-channel constants are read once instead of once per element.
    const float channel_mean = mean[c];
    const float channel_scale = inv_std[c];
    for (std::size_t i = 0; i < plane_size; ++i) {
      plane[i] = (plane[i] - channel_mean) * channel_scale;
    }
  }
}
// Copies cropped (and optionally mirrored) pixels from an 8-bit frame buffer
// into a float clip, normalizing with mean_rgb / inv_std_rgb.
//
// Source indexing is interleaved per pixel (... * channels_rgb + c); the
// destination is planar: ((c * length_rgb + l) * crop_height + h) * crop_width
// + w.
//
// NOTE(review): this excerpt carries embedded line numbers and skips lines;
// the `height`, `width`, `h_off`, `w_off` parameters, the mirror condition and
// the jitter/lighting/normalization calls are not visible here.
187 void ClipTransformRGB(
188 const unsigned char* buffer_rgb,
189 const int multi_crop_count,
190 const int crop_height,
191 const int crop_width,
192 const int length_rgb,
193 const int channels_rgb,
194 const int sampling_rate_rgb,
199 const int* multi_crop_h_off,
200 const int* multi_crop_w_off,
201 const bool mirror_me,
202 const bool color_jitter,
203 const float saturation,
204 const float brightness,
205 const float contrast,
206 const bool color_lighting,
207 const float color_lighting_std,
208 const std::vector<std::vector<float>>& color_lighting_eigvecs,
209 const std::vector<float>& color_lighting_eigvals,
210 const std::vector<float>& mean_rgb,
211 const std::vector<float>& inv_std_rgb,
212 std::mt19937* randgen,
213 float* transformed_clip) {
// Arguments of two size checks; the checking macro itself is on missing lines.
215 channels_rgb, mean_rgb.size(),
"rgb channels must be equal to mean size");
219 "mean size must be equal to inv_std size");
220 int orig_index, tran_index;
// ---- Single-crop path ----
221 if (multi_crop_count == 1) {
// Jitter / lighting are only considered for 3-channel input.
224 bool do_color_jitter_lighting =
225 (color_jitter || color_lighting) && channels_rgb == 3;
226 for (
int c = 0; c < channels_rgb; ++c) {
227 for (
int l = 0; l < length_rgb; ++l) {
// Output frame l reads source frame l * sampling_rate_rgb.
229 l * sampling_rate_rgb * height * width * channels_rgb;
230 int tran_index_l = (c * length_rgb + l) * crop_height;
232 for (
int h = 0; h < crop_height; ++h) {
// Crop rows start h_off rows into the source frame.
233 int orig_index_h = orig_index_l + (h + h_off) * width * channels_rgb;
234 int tran_index_h = (tran_index_l + h) * crop_width;
236 for (
int w = 0; w < crop_width; ++w) {
// Crop columns start w_off pixels in; `+ c` selects the channel of the
// interleaved pixel.
237 orig_index = orig_index_h + (w + w_off) * channels_rgb + c;
// Mirrored write (column order reversed) vs. straight write; the selecting
// condition is on a missing line (presumably mirror_me -- confirm).
241 tran_index = tran_index_h + (crop_width - 1 - w);
243 tran_index = tran_index_h + w;
// When jitter/lighting will run, store the raw pixel value unnormalized here;
// otherwise normalize right away.
247 if (do_color_jitter_lighting) {
248 transformed_clip[tran_index] = buffer_rgb[orig_index];
250 transformed_clip[tran_index] =
251 (buffer_rgb[orig_index] - mean_rgb[c]) * inv_std_rgb[c];
// Post-passes over the whole clip; their bodies are on missing lines.
257 if (color_jitter && channels_rgb == 3) {
268 if (color_lighting && channels_rgb == 3) {
275 color_lighting_eigvecs,
276 color_lighting_eigvals,
// Body missing from this excerpt; presumably the normalization deferred above.
279 if (do_color_jitter_lighting) {
// ---- Multi-crop path ----
// Two passes: multi_crop_mirror == 0 writes crops as-is, == 1 writes them
// mirrored. Each pass visits multi_crop_count / 2 crop offsets.
294 for (
int multi_crop_mirror = 0; multi_crop_mirror < 2;
295 ++multi_crop_mirror) {
296 for (
int i = 0; i < multi_crop_count / 2; ++i) {
297 for (
int c = 0; c < channels_rgb; ++c) {
298 for (
int l = 0; l < length_rgb; ++l) {
300 l * sampling_rate_rgb * height * width * channels_rgb;
301 int tran_index_l = (c * length_rgb + l) * crop_height;
303 for (
int h = 0; h < crop_height; ++h) {
// Per-crop offsets come from the multi_crop_*_off arrays.
304 int orig_index_h = orig_index_l +
305 (h + multi_crop_h_off[i]) * width * channels_rgb;
306 int tran_index_h = (tran_index_l + h) * crop_width;
308 for (
int w = 0; w < crop_width; ++w) {
310 orig_index_h + (w + multi_crop_w_off[i]) * channels_rgb + c;
312 if (multi_crop_mirror == 1) {
313 tran_index = tran_index_h + (crop_width - 1 - w);
315 tran_index = tran_index_h + w;
// Multi-crop output is always normalized immediately (no jitter in this path
// as far as the visible lines show).
318 transformed_clip[tran_index] =
319 (buffer_rgb[orig_index] - mean_rgb[c]) * inv_std_rgb[c];
// Element count of one output crop; the statement using it is on a missing
// line (presumably advancing transformed_clip to the next crop -- confirm).
325 channels_rgb * length_rgb * crop_height * crop_width;
// For each of length_of output steps: gathers cropped grayscale frames from
// buffer_rgb, runs MultiFrameOpticalFlowExtractor on them, normalizes the two
// flow channels with mean_of / inv_std_of and copies them into planar
// transformed_clip. Depending on flow_data_type, extra channels (magnitude,
// gray frame, or RGB frame) are appended.
//
// NOTE(review): this excerpt carries embedded line numbers and skips lines;
// the `length_of`, `height`, `width` parameters, the copy calls, the
// declaration of `c`, `mag`, `gray`, and all `break`/closing lines are not
// visible here.
331 void ClipTransformOpticalFlow(
332 const unsigned char* buffer_rgb,
333 const int crop_height,
334 const int crop_width,
336 const int channels_of,
337 const int sampling_rate_of,
340 const cv::Rect& rect,
341 const int channels_rgb,
342 const bool mirror_me,
343 const int flow_alg_type,
344 const int flow_data_type,
345 const int frame_gap_of,
346 const bool do_flow_aggregation,
347 const std::vector<float>& mean_of,
348 const std::vector<float>& inv_std_of,
349 float* transformed_clip) {
// Elements per output frame and per output channel plane.
350 const int frame_size = crop_height * crop_width;
351 const int channel_size_flow = length_of * frame_size;
// Statistics gathering is hard-wired off; the branches below that test this
// flag never run unless it is edited.
354 bool extract_statistics =
false;
// Function-local statics: sized from channels_of on the first call only and
// shared by every later call.
// NOTE(review): not synchronized -- confirm single-threaded use before
// enabling extract_statistics.
355 static std::vector<double> mean_static(channels_of, 0.f);
356 static std::vector<double> std_static(channels_of, 0.f);
357 static long long count = 0;
358 cv::Scalar mean_img, std_img;
360 for (
int l = 0; l < length_of; l++) {
362 std::vector<cv::Mat> grays, rgbs;
// With aggregation every frame in [0, frame_gap_of] is used; otherwise only
// frame 0 and frame frame_gap_of.
// NOTE(review): without aggregation and with frame_gap_of == 0 the step is 0
// and the loop below would not advance -- confirm callers exclude that case.
363 int step_size = do_flow_aggregation ? 1 : frame_gap_of;
364 for (
int j = 0; j <= frame_gap_of; j += step_size) {
// Start of source frame (l * sampling_rate_of + j).
366 const unsigned char* curr_frame = buffer_rgb +
367 (l * sampling_rate_of + j) * height * width * channels_rgb;
// NOTE(review): the image is allocated with 3 channels while the byte count
// below uses channels_rgb -- assumes channels_rgb == 3, confirm.
368 cv::Mat img = cv::Mat::zeros(height, width, CV_8UC3);
372 height * width * channels_rgb *
sizeof(
unsigned char));
// Crop to `rect`; the flip below uses flip code 1 and its guarding condition
// is on a missing line (presumably mirror_me -- confirm).
375 cv::Mat img_cropped = img(rect);
377 cv::flip(img_cropped, img_cropped, 1);
// Keep both the grayscale and the color version of the cropped frame.
381 cv::cvtColor(img_cropped, gray, cv::COLOR_RGB2GRAY);
382 grays.push_back(gray);
383 rgbs.push_back(img_cropped);
// Compute a 2-channel float flow field from the collected gray frames.
386 cv::Mat first_gray, first_rgb;
387 cv::Mat flow = cv::Mat::zeros(crop_height, crop_width, CV_32FC2);
388 MultiFrameOpticalFlowExtractor(grays, flow_alg_type, flow);
// Split into per-component planes; each is normalized and copied into its
// output plane at offset c * channel_size_flow + l * frame_size.
390 std::vector<cv::Mat> imgs;
391 cv::split(flow, imgs);
395 if (extract_statistics) {
396 cv::meanStdDev(imgs[c], mean_img, std_img);
397 mean_static[c] += mean_img[0];
398 std_static[c] += std_img[0];
401 imgs[c] -= mean_of[c];
402 imgs[c] *= inv_std_of[c];
404 transformed_clip + c * channel_size_flow + l * frame_size,
406 frame_size *
sizeof(
float));
// Optional extra channels, selected by flow_data_type.
410 std::vector<cv::Mat> chans;
412 switch (flow_data_type) {
// Two flow channels only; the case body is on missing lines.
413 case FlowDataType::Flow2C:
// Third channel: |flow_x| + |flow_y| of the already-normalized components.
417 case FlowDataType::Flow3C:
419 mag = cv::abs(imgs[0]) + cv::abs(imgs[1]);
420 if (extract_statistics) {
421 cv::meanStdDev(mag, mean_img, std_img);
422 mean_static[c] += mean_img[0];
423 std_static[c] += std_img[0];
427 mag *= inv_std_of[c];
429 transformed_clip + c * channel_size_flow + l * frame_size,
431 frame_size *
sizeof(
float));
// Third channel: the first grayscale frame, converted to float and normalized.
434 case FlowDataType::FlowWithGray:
436 grays[0].convertTo(first_gray, CV_32FC1);
437 if (extract_statistics) {
438 cv::meanStdDev(first_gray, mean_img, std_img);
439 mean_static[c] += mean_img[0];
440 std_static[c] += std_img[0];
443 first_gray -= mean_of[c];
444 first_gray *= inv_std_of[c];
446 transformed_clip + c * channel_size_flow + l * frame_size,
448 frame_size *
sizeof(
float));
// Remaining channels: the planes of the first color frame. chans is indexed
// with c - 2, so c is presumably 2 on entry -- confirm against the full file.
451 case FlowDataType::FlowWithRGB:
453 rgbs[0].convertTo(first_rgb, CV_32FC3);
454 cv::split(first_rgb, chans);
455 for (; c < channels_of; c++) {
456 if (extract_statistics) {
457 cv::meanStdDev(chans[c - 2], mean_img, std_img);
458 mean_static[c] += mean_img[0];
459 std_static[c] += std_img[0];
462 chans[c - 2] -= mean_of[c];
463 chans[c - 2] *= inv_std_of[c];
465 transformed_clip + c * channel_size_flow + l * frame_size,
467 frame_size *
sizeof(
float));
// Unknown data type: logged, not thrown.
472 LOG(ERROR) <<
"Unsupported optical flow data type " << flow_data_type;
// Periodically report the running per-channel averages (every 1000 counts).
476 if (extract_statistics) {
478 if (count % 1000 == 1) {
479 for (
int i = 0; i < channels_of; i++) {
481 <<
"-th channel mean: " << mean_static[i] / float(count)
482 <<
" std: " << std_static[i] / float(count);
489 void FreeDecodedData(
490 std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
492 for (
int i = 0; i < sampledFrames.size(); i++) {
493 DecodedFrame* p = sampledFrames[i].release();
496 sampledFrames.clear();
// Decodes a video (from an in-memory buffer or from a file) and slices the
// decoded frames into clip_per_video clips of params.num_of_required_frame_
// frames each. Every clip is copied into a freshly allocated byte buffer that
// is appended to buffer_rgb.
//
// NOTE(review): this excerpt carries embedded line numbers and skips lines;
// the `start_frm`, `height`, `width` parameters, the copy call, and every
// `return` are not visible here.
499 bool DecodeMultipleClipsFromVideo(
500 const char* video_buffer,
501 const std::string& video_filename,
502 const int encoded_size,
503 const Params& params,
505 const int clip_per_video,
506 const bool use_local_file,
509 std::vector<unsigned char*>& buffer_rgb) {
510 std::vector<std::unique_ptr<DecodedFrame>> sampledFrames;
511 VideoDecoder decoder;
// Decode from memory unless the caller asked for the local file.
514 if (!use_local_file) {
515 decoder.decodeMemory(
516 video_buffer, encoded_size, params, start_frm, sampledFrames);
518 decoder.decodeFile(video_filename, params, start_frm, sampledFrames);
// Walks the buffers already present in buffer_rgb; what is done with each is
// on missing lines (presumably they are freed -- confirm).
521 for (
int i = 0; i < buffer_rgb.size(); i++) {
522 unsigned char* buff = buffer_rgb[i];
// Too few decoded frames: drop what was decoded. The rest of this branch
// (presumably logging and a failure return) is not visible here.
527 if (sampledFrames.size() < params.num_of_required_frame_) {
532 FreeDecodedData(sampledFrames);
// Frame geometry is taken from the first decoded frame.
536 height = sampledFrames[0]->height_;
537 width = sampledFrames[0]->width_;
// Spacing between clip start frames: the surplus frames are spread evenly
// over clip_per_video - 1 gaps. The value used when clip_per_video <= 1 is on
// a missing line.
538 float sample_stepsz = (clip_per_video <= 1)
540 : (
float(sampledFrames.size() - params.num_of_required_frame_) /
541 (clip_per_video - 1));
// Three bytes per pixel.
543 int image_size = 3 * height * width;
544 int clip_size = params.num_of_required_frame_ * image_size;
546 for (
int i = 0; i < clip_per_video; i++) {
// Raw allocation whose pointer is stored in buffer_rgb below; nothing in the
// visible lines frees it, so the caller is presumably expected to delete[] it.
547 unsigned char* buffer_rgb_ptr =
new unsigned char[clip_size];
// First frame of clip i.
548 int clip_start = floor(i * sample_stepsz);
// Copy the clip's frames back to back into the new buffer.
549 for (
int j = 0; j < params.num_of_required_frame_; j++) {
551 buffer_rgb_ptr + j * image_size,
552 (
unsigned char*)sampledFrames[j + clip_start]->data_.get(),
553 image_size *
sizeof(
unsigned char));
555 buffer_rgb.push_back(buffer_rgb_ptr);
// Decoded frames are no longer needed once every clip has been copied out.
557 FreeDecodedData(sampledFrames);
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...