Caffe2 - C++ API
A deep learning, cross platform ML framework
video_io.cc
1 #include <caffe2/video/video_io.h>
2 #include <caffe2/core/logging.h>
3 #include <algorithm>
4 #include <random>
5 #include <string>
6 
7 namespace caffe2 {
8 
9 // assume CLHW order and color channels RGB
// assume CLHW order and color channels RGB
// Randomly blends each pixel with its grayscale value, scaling color
// saturation by a factor alpha drawn from [1 - alpha_rand, 1 + alpha_rand].
void Saturation(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_rand,
    std::mt19937* randgen) {
  std::uniform_real_distribution<float> jitter(-alpha_rand, alpha_rand);
  const float alpha = 1.0f + jitter(*randgen);

  // Channels are planar: R, G and B planes are channel_size floats apart.
  const int channel_size = length * crop_height * crop_width;
  for (int idx = 0; idx < channel_size; ++idx) {
    const float r = clip[idx];
    const float g = clip[idx + channel_size];
    const float b = clip[idx + 2 * channel_size];
    // RGB to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114
    const float gray_color = r * 0.299f + g * 0.587f + b * 0.114f;
    clip[idx] = r * alpha + gray_color * (1.0f - alpha);
    clip[idx + channel_size] = g * alpha + gray_color * (1.0f - alpha);
    clip[idx + 2 * channel_size] = b * alpha + gray_color * (1.0f - alpha);
  }
}
37 
38 // assume CLHW order and color channels RGB
// assume CLHW order and color channels RGB
// Scales every value in the clip by a random brightness factor drawn from
// [1 - alpha_rand, 1 + alpha_rand].
void Brightness(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_rand,
    std::mt19937* randgen) {
  std::uniform_real_distribution<float> jitter(-alpha_rand, alpha_rand);
  const float scale = 1.0f + jitter(*randgen);

  // Brightness is channel-independent, so a single flat pass suffices.
  const int total = 3 * length * crop_height * crop_width;
  for (int idx = 0; idx < total; ++idx) {
    clip[idx] *= scale;
  }
}
60 
61 // assume CLHW order and color channels RGB
// assume CLHW order and color channels RGB
// Blends every value with the clip-wide mean gray level, scaling contrast
// by a factor alpha drawn from [1 - alpha_rand, 1 + alpha_rand].
void Contrast(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_rand,
    std::mt19937* randgen) {
  const int channel_size = length * crop_height * crop_width;

  // Mean gray value over all pixels of the clip.
  // RGB to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114
  float gray_mean = 0;
  for (int idx = 0; idx < channel_size; ++idx) {
    gray_mean += clip[idx] * 0.299f + clip[idx + channel_size] * 0.587f +
        clip[idx + 2 * channel_size] * 0.114f;
  }
  gray_mean /= channel_size;

  std::uniform_real_distribution<float> jitter(-alpha_rand, alpha_rand);
  const float alpha = 1.0f + jitter(*randgen);

  // Push every value toward (alpha < 1) or away from (alpha > 1) the mean.
  const int total = 3 * channel_size;
  for (int idx = 0; idx < total; ++idx) {
    clip[idx] = clip[idx] * alpha + gray_mean * (1.0f - alpha);
  }
}
98 
99 // assume CLHW order and color channels RGB
100 void ColorJitter(
101  float* clip,
102  const int length,
103  const int crop_height,
104  const int crop_width,
105  const float saturation,
106  const float brightness,
107  const float contrast,
108  std::mt19937* randgen) {
109  std::srand(unsigned(std::time(0)));
110  std::vector<int> jitter_order{0, 1, 2};
111  // obtain a time-based seed:
112  unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
113  std::shuffle(
114  jitter_order.begin(),
115  jitter_order.end(),
116  std::default_random_engine(seed));
117 
118  for (int i = 0; i < 3; ++i) {
119  if (jitter_order[i] == 0) {
120  Saturation(clip, length, crop_height, crop_width, saturation, randgen);
121  } else if (jitter_order[i] == 1) {
122  Brightness(clip, length, crop_height, crop_width, brightness, randgen);
123  } else {
124  Contrast(clip, length, crop_height, crop_width, contrast, randgen);
125  }
126  }
127 }
128 
129 // assume CLHW order and color channels RGB
// assume CLHW order and color channels RGB
// PCA-based color lighting: adds a per-channel shift computed from the RGB
// covariance eigenvectors/eigenvalues scaled by Gaussian random coefficients.
void ColorLighting(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_std,
    const std::vector<std::vector<float>>& eigvecs,
    const std::vector<float>& eigvals,
    std::mt19937* randgen) {
  // One Gaussian coefficient per principal component.
  std::normal_distribution<float> gauss(0, alpha_std);
  std::vector<float> alphas(3);
  for (auto& a : alphas) {
    a = gauss(*randgen);
  }

  // delta_rgb = eigvecs * (eigvals .* alphas)
  std::vector<float> delta_rgb(3, 0.0);
  for (int row = 0; row < 3; ++row) {
    for (int col = 0; col < 3; ++col) {
      delta_rgb[row] += eigvecs[row][col] * eigvals[col] * alphas[col];
    }
  }

  // Apply the constant shift to each channel plane.
  const int channel_size = length * crop_height * crop_width;
  float* pos = clip;
  for (int c = 0; c < 3; ++c) {
    for (int idx = 0; idx < channel_size; ++idx) {
      *pos++ += delta_rgb[c];
    }
  }
}
163 
164 // assume CLHW order and color channels RGB
165 // mean subtraction and scaling.
// assume CLHW order and color channels RGB
// mean subtraction and scaling.
// Normalizes each channel plane in place: x -> (x - mean[c]) * inv_std[c].
void ColorNormalization(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const int channels,
    const std::vector<float>& mean,
    const std::vector<float>& inv_std) {
  const int channel_size = length * crop_height * crop_width;
  float* pos = clip;
  for (int c = 0; c < channels; ++c) {
    // Hoist the per-channel constants out of the pixel loop.
    const float channel_mean = mean[c];
    const float channel_inv_std = inv_std[c];
    for (int idx = 0; idx < channel_size; ++idx) {
      *pos = (*pos - channel_mean) * channel_inv_std;
      ++pos;
    }
  }
}
186 
// Crops (and optionally mirrors) a decoded interleaved-RGB frame buffer into
// one or more planar CLHW clips, normalizing by per-channel mean / inv_std.
//
// buffer_rgb is interleaved HWC per frame; transformed_clip is planar CLHW.
// With multi_crop_count == 1, a single crop at (h_off, w_off) is produced and
// color jitter / lighting may run on the raw pixel values before
// normalization. With multi_crop_count > 1 (test-time multi-cropping),
// multi_crop_count/2 crops at multi_crop_h_off/multi_crop_w_off are each
// written twice (unmirrored then mirrored), advancing transformed_clip by one
// clip per crop; jitter/lighting are intentionally skipped in that mode.
void ClipTransformRGB(
    const unsigned char* buffer_rgb,
    const int multi_crop_count,
    const int crop_height,
    const int crop_width,
    const int length_rgb,
    const int channels_rgb,
    const int sampling_rate_rgb,
    const int height,
    const int width,
    const int h_off,
    const int w_off,
    const int* multi_crop_h_off,
    const int* multi_crop_w_off,
    const bool mirror_me,
    const bool color_jitter,
    const float saturation,
    const float brightness,
    const float contrast,
    const bool color_lighting,
    const float color_lighting_std,
    const std::vector<std::vector<float>>& color_lighting_eigvecs,
    const std::vector<float>& color_lighting_eigvals,
    const std::vector<float>& mean_rgb,
    const std::vector<float>& inv_std_rgb,
    std::mt19937* randgen,
    float* transformed_clip) {
  CAFFE_ENFORCE_EQ(
      channels_rgb, mean_rgb.size(), "rgb channels must be equal to mean size");
  CAFFE_ENFORCE_EQ(
      mean_rgb.size(),
      inv_std_rgb.size(),
      "mean size must be equal to inv_std size");
  int orig_index, tran_index;
  if (multi_crop_count == 1) {
    // Case 1: Multi_cropping is disabled
    // The order of output dimensions is C, L, H, W
    // Jitter/lighting only make sense for 3-channel RGB input.
    bool do_color_jitter_lighting =
        (color_jitter || color_lighting) && channels_rgb == 3;
    for (int c = 0; c < channels_rgb; ++c) {
      for (int l = 0; l < length_rgb; ++l) {
        // Source frame l is sampled every sampling_rate_rgb frames.
        int orig_index_l =
            l * sampling_rate_rgb * height * width * channels_rgb;
        int tran_index_l = (c * length_rgb + l) * crop_height;

        for (int h = 0; h < crop_height; ++h) {
          // Offset into the source row (h + h_off) of the interleaved frame.
          int orig_index_h = orig_index_l + (h + h_off) * width * channels_rgb;
          int tran_index_h = (tran_index_l + h) * crop_width;

          for (int w = 0; w < crop_width; ++w) {
            orig_index = orig_index_h + (w + w_off) * channels_rgb + c;

            // mirror the frame
            if (mirror_me) {
              tran_index = tran_index_h + (crop_width - 1 - w);
            } else {
              tran_index = tran_index_h + w;
            }

            // normalize and transform the clip
            // If jitter/lighting will run, copy raw pixel values now and
            // defer normalization until after those augmentations.
            if (do_color_jitter_lighting) {
              transformed_clip[tran_index] = buffer_rgb[orig_index];
            } else {
              transformed_clip[tran_index] =
                  (buffer_rgb[orig_index] - mean_rgb[c]) * inv_std_rgb[c];
            }
          }
        }
      }
    }
    if (color_jitter && channels_rgb == 3) {
      ColorJitter(
          transformed_clip,
          length_rgb,
          crop_height,
          crop_width,
          saturation,
          brightness,
          contrast,
          randgen);
    }
    if (color_lighting && channels_rgb == 3) {
      ColorLighting(
          transformed_clip,
          length_rgb,
          crop_height,
          crop_width,
          color_lighting_std,
          color_lighting_eigvecs,
          color_lighting_eigvals,
          randgen);
    }
    if (do_color_jitter_lighting) {
      // Color normalization
      // Mean subtraction and division by standard deviation.
      ColorNormalization(
          transformed_clip,
          length_rgb,
          crop_height,
          crop_width,
          channels_rgb,
          mean_rgb,
          inv_std_rgb);
    }
  } else {
    // Case 2: Multi_cropping is enabled. Multi cropping should be only used at
    // testing stage. So color jittering and lighting are not used
    for (int multi_crop_mirror = 0; multi_crop_mirror < 2;
         ++multi_crop_mirror) {
      for (int i = 0; i < multi_crop_count / 2; ++i) {
        for (int c = 0; c < channels_rgb; ++c) {
          for (int l = 0; l < length_rgb; ++l) {
            int orig_index_l =
                l * sampling_rate_rgb * height * width * channels_rgb;
            int tran_index_l = (c * length_rgb + l) * crop_height;

            for (int h = 0; h < crop_height; ++h) {
              int orig_index_h = orig_index_l +
                  (h + multi_crop_h_off[i]) * width * channels_rgb;
              int tran_index_h = (tran_index_l + h) * crop_width;

              for (int w = 0; w < crop_width; ++w) {
                orig_index =
                    orig_index_h + (w + multi_crop_w_off[i]) * channels_rgb + c;

                // Second pass (multi_crop_mirror == 1) writes the
                // horizontally mirrored version of each crop.
                if (multi_crop_mirror == 1) {
                  tran_index = tran_index_h + (crop_width - 1 - w);
                } else {
                  tran_index = tran_index_h + w;
                }

                transformed_clip[tran_index] =
                    (buffer_rgb[orig_index] - mean_rgb[c]) * inv_std_rgb[c];
              }
            }
          }
        }
        // Advance the output pointer past the clip just written.
        transformed_clip +=
            channels_rgb * length_rgb * crop_height * crop_width;
      }
    }
  }
}
330 
// Computes optical flow channels for each of length_of output frames from a
// decoded RGB buffer, writing a normalized planar (C, L, H, W) clip into
// transformed_clip.
//
// For each output frame, frames spanning frame_gap_of source frames are
// cropped to `rect` (mirrored if mirror_me), converted to grayscale, and fed
// to MultiFrameOpticalFlowExtractor. The resulting 2-channel flow is always
// written; flow_data_type selects an optional extra channel set: magnitude
// (Flow3C), first grayscale frame (FlowWithGray) or the first RGB frame
// (FlowWithRGB). Every channel is normalized as (x - mean_of[c]) * inv_std_of[c].
void ClipTransformOpticalFlow(
    const unsigned char* buffer_rgb,
    const int crop_height,
    const int crop_width,
    const int length_of,
    const int channels_of,
    const int sampling_rate_of,
    const int height,
    const int width,
    const cv::Rect& rect,
    const int channels_rgb,
    const bool mirror_me,
    const int flow_alg_type,
    const int flow_data_type,
    const int frame_gap_of,
    const bool do_flow_aggregation,
    const std::vector<float>& mean_of,
    const std::vector<float>& inv_std_of,
    float* transformed_clip) {
  const int frame_size = crop_height * crop_width;
  const int channel_size_flow = length_of * frame_size;

  // for get the mean and std of the input data
  // Debug-only statistics accumulators; extract_statistics is hardcoded off.
  // NOTE(review): the static vectors are sized from the first call's
  // channels_of and are not thread-safe — acceptable only for debugging.
  bool extract_statistics = false;
  static std::vector<double> mean_static(channels_of, 0.f);
  static std::vector<double> std_static(channels_of, 0.f);
  static long long count = 0;
  cv::Scalar mean_img, std_img;

  for (int l = 0; l < length_of; l++) {
    // get the grayscale frames
    // With aggregation, use every frame in the gap; otherwise only the two
    // endpoint frames (step of frame_gap_of).
    std::vector<cv::Mat> grays, rgbs;
    int step_size = do_flow_aggregation ? 1 : frame_gap_of;
    for (int j = 0; j <= frame_gap_of; j += step_size) {
      // get the current frame
      const unsigned char* curr_frame = buffer_rgb +
          (l * sampling_rate_of + j) * height * width * channels_rgb;
      cv::Mat img = cv::Mat::zeros(height, width, CV_8UC3);
      memcpy(
          img.data,
          curr_frame,
          height * width * channels_rgb * sizeof(unsigned char));

      // crop and mirror the frame
      cv::Mat img_cropped = img(rect);
      if (mirror_me) {
        cv::flip(img_cropped, img_cropped, 1);
      }

      cv::Mat gray;
      cv::cvtColor(img_cropped, gray, cv::COLOR_RGB2GRAY);
      grays.push_back(gray);
      rgbs.push_back(img_cropped);
    }

    cv::Mat first_gray, first_rgb;
    cv::Mat flow = cv::Mat::zeros(crop_height, crop_width, CV_32FC2);
    MultiFrameOpticalFlowExtractor(grays, flow_alg_type, flow);

    std::vector<cv::Mat> imgs;
    cv::split(flow, imgs);
    // save the 2-channel optical flow first
    // c keeps counting across the switch below so extra channels land after
    // the two flow channels in the output layout.
    int c = 0;
    for (; c < 2; c++) {
      if (extract_statistics) {
        cv::meanStdDev(imgs[c], mean_img, std_img);
        mean_static[c] += mean_img[0];
        std_static[c] += std_img[0];
      }

      // Normalize in place, then copy this frame into channel plane c.
      imgs[c] -= mean_of[c];
      imgs[c] *= inv_std_of[c];
      memcpy(
          transformed_clip + c * channel_size_flow + l * frame_size,
          imgs[c].data,
          frame_size * sizeof(float));
    }

    cv::Mat mag;
    std::vector<cv::Mat> chans;
    // augment the optical flow with more channels
    switch (flow_data_type) {
      case FlowDataType::Flow2C:
        // nothing to do if we only need two channels
        break;

      case FlowDataType::Flow3C:
        // use magnitude as the third channel
        // (L1 magnitude |dx| + |dy|, not the Euclidean norm)
        mag = cv::abs(imgs[0]) + cv::abs(imgs[1]);
        if (extract_statistics) {
          cv::meanStdDev(mag, mean_img, std_img);
          mean_static[c] += mean_img[0];
          std_static[c] += std_img[0];
        }

        mag -= mean_of[c];
        mag *= inv_std_of[c];
        memcpy(
            transformed_clip + c * channel_size_flow + l * frame_size,
            mag.data,
            frame_size * sizeof(float));
        break;

      case FlowDataType::FlowWithGray:
        // add grayscale image as the third channel
        grays[0].convertTo(first_gray, CV_32FC1);
        if (extract_statistics) {
          cv::meanStdDev(first_gray, mean_img, std_img);
          mean_static[c] += mean_img[0];
          std_static[c] += std_img[0];
        }

        first_gray -= mean_of[c];
        first_gray *= inv_std_of[c];
        memcpy(
            transformed_clip + c * channel_size_flow + l * frame_size,
            first_gray.data,
            frame_size * sizeof(float));
        break;

      case FlowDataType::FlowWithRGB:
        // add all three rgb channels
        // (channels 2..channels_of-1 map to the first RGB frame's planes)
        rgbs[0].convertTo(first_rgb, CV_32FC3);
        cv::split(first_rgb, chans);
        for (; c < channels_of; c++) {
          if (extract_statistics) {
            cv::meanStdDev(chans[c - 2], mean_img, std_img);
            mean_static[c] += mean_img[0];
            std_static[c] += std_img[0];
          }

          chans[c - 2] -= mean_of[c];
          chans[c - 2] *= inv_std_of[c];
          memcpy(
              transformed_clip + c * channel_size_flow + l * frame_size,
              chans[c - 2].data,
              frame_size * sizeof(float));
        }
        break;

      default:
        LOG(ERROR) << "Unsupported optical flow data type " << flow_data_type;
        break;
    }

    // Periodically dump the accumulated per-channel statistics (debug only).
    if (extract_statistics) {
      count++;
      if (count % 1000 == 1) {
        for (int i = 0; i < channels_of; i++) {
          LOG(INFO) << i
                    << "-th channel mean: " << mean_static[i] / float(count)
                    << " std: " << std_static[i] / float(count);
        }
      }
    }
  }
}
488 
489 void FreeDecodedData(
490  std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
491  // free the sampledFrames
492  for (int i = 0; i < sampledFrames.size(); i++) {
493  DecodedFrame* p = sampledFrames[i].release();
494  delete p;
495  }
496  sampledFrames.clear();
497 }
498 
// Decodes a video (from an in-memory buffer or a local file) and extracts
// clip_per_video evenly spaced RGB clips into buffer_rgb.
//
// Any buffers previously held by buffer_rgb are deleted first. On success,
// buffer_rgb receives clip_per_video heap-allocated buffers of
// 3 * height * width * num_of_required_frame_ bytes each; ownership passes
// to the caller (raw new[] — caller must delete[]).
//
// NOTE(review): when too few frames were decoded, the function frees the
// frames and still returns true, leaving buffer_rgb empty — callers
// presumably detect failure via buffer_rgb.empty(); confirm before relying
// on the return value.
bool DecodeMultipleClipsFromVideo(
    const char* video_buffer,
    const std::string& video_filename,
    const int encoded_size,
    const Params& params,
    const int start_frm,
    const int clip_per_video,
    const bool use_local_file,
    int& height,
    int& width,
    std::vector<unsigned char*>& buffer_rgb) {
  std::vector<std::unique_ptr<DecodedFrame>> sampledFrames;
  VideoDecoder decoder;

  // decoding from buffer or file
  if (!use_local_file) {
    decoder.decodeMemory(
        video_buffer, encoded_size, params, start_frm, sampledFrames);
  } else {
    decoder.decodeFile(video_filename, params, start_frm, sampledFrames);
  }

  // Drop any clips left over from a previous call before refilling.
  for (int i = 0; i < buffer_rgb.size(); i++) {
    unsigned char* buff = buffer_rgb[i];
    delete[] buff;
  }
  buffer_rgb.clear();

  // Not enough frames decoded: bail out with an empty buffer_rgb.
  if (sampledFrames.size() < params.num_of_required_frame_) {
    // LOG(ERROR) << "The video seems faulty and we could not decode enough
    // frames: "
    //            << sampledFrames.size() << " VS "
    //            << params.num_of_required_frame_;
    FreeDecodedData(sampledFrames);
    return true;
  }

  height = sampledFrames[0]->height_;
  width = sampledFrames[0]->width_;
  // Step between clip start frames so clips span the decoded range evenly;
  // a single clip always starts at frame 0.
  float sample_stepsz = (clip_per_video <= 1)
      ? 0
      : (float(sampledFrames.size() - params.num_of_required_frame_) /
         (clip_per_video - 1));

  int image_size = 3 * height * width;
  int clip_size = params.num_of_required_frame_ * image_size;
  // get the RGB frames for each clip
  for (int i = 0; i < clip_per_video; i++) {
    unsigned char* buffer_rgb_ptr = new unsigned char[clip_size];
    int clip_start = floor(i * sample_stepsz);
    for (int j = 0; j < params.num_of_required_frame_; j++) {
      memcpy(
          buffer_rgb_ptr + j * image_size,
          (unsigned char*)sampledFrames[j + clip_start]->data_.get(),
          image_size * sizeof(unsigned char));
    }
    buffer_rgb.push_back(buffer_rgb_ptr);
  }
  FreeDecodedData(sampledFrames);

  return true;
}
561 
562 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13