// Caffe2 - C++ API
// A deep learning, cross platform ML framework
// Source file: video_io.cc
17 #include <caffe2/video/video_io.h>
18 #include <caffe2/core/logging.h>
19 #include <algorithm>
20 #include <random>
21 #include <string>
22 
23 namespace caffe2 {
24 
// assume CLHW order and color channels RGB
//
// Randomly perturbs the color saturation of the clip in place. A random
// alpha in [1 - alpha_rand, 1 + alpha_rand] is drawn once from randgen;
// each pixel is blended between its original color (weight alpha) and its
// grayscale value (weight 1 - alpha). alpha == 1 leaves the clip unchanged.
void Saturation(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_rand,
    std::mt19937* randgen) {
  std::uniform_real_distribution<float> jitter(-alpha_rand, alpha_rand);
  const float alpha = 1.0f + jitter(*randgen);

  // RGB to grayscale weights: R -> 0.299, G -> 0.587, B -> 0.114.
  // The three color planes are channel_size floats apart, so a single
  // flat index over one plane addresses all of L x H x W.
  const int channel_size = length * crop_height * crop_width;
  for (int idx = 0; idx < channel_size; ++idx) {
    const float gray = clip[idx] * 0.299f +
        clip[idx + channel_size] * 0.587f +
        clip[idx + 2 * channel_size] * 0.114f;
    for (int c = 0; c < 3; ++c) {
      float* px = clip + c * channel_size + idx;
      *px = *px * alpha + gray * (1.0f - alpha);
    }
  }
}
53 
// assume CLHW order and color channels RGB
//
// Randomly perturbs brightness in place: every pixel of the clip is
// scaled by a single factor drawn from [1 - alpha_rand, 1 + alpha_rand].
void Brightness(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_rand,
    std::mt19937* randgen) {
  std::uniform_real_distribution<float> jitter(-alpha_rand, alpha_rand);
  const float alpha = 1.0f + jitter(*randgen);

  // All three channels receive the same gain, so one flat pass over the
  // whole CLHW buffer is equivalent to the nested c/l/h/w loops.
  const int total = 3 * length * crop_height * crop_width;
  for (int i = 0; i < total; ++i) {
    clip[i] *= alpha;
  }
}
76 
// assume CLHW order and color channels RGB
//
// Randomly perturbs contrast in place: every pixel is blended between its
// original value (weight alpha) and the clip's mean grayscale intensity
// (weight 1 - alpha), with alpha drawn from [1 - alpha_rand, 1 + alpha_rand].
void Contrast(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_rand,
    std::mt19937* randgen) {
  const int channel_size = length * crop_height * crop_width;

  // Mean grayscale intensity over the whole clip.
  // RGB to grayscale weights: R -> 0.299, G -> 0.587, B -> 0.114.
  float gray_mean = 0;
  for (int idx = 0; idx < channel_size; ++idx) {
    gray_mean += clip[idx] * 0.299f + clip[idx + channel_size] * 0.587f +
        clip[idx + 2 * channel_size] * 0.114f;
  }
  gray_mean /= channel_size;

  std::uniform_real_distribution<float> jitter(-alpha_rand, alpha_rand);
  const float alpha = 1.0f + jitter(*randgen);

  // Same blend applied uniformly, so iterate the CLHW buffer flat.
  const int total = 3 * channel_size;
  for (int i = 0; i < total; ++i) {
    clip[i] = clip[i] * alpha + gray_mean * (1.0f - alpha);
  }
}
114 
115 // assume CLHW order and color channels RGB
116 void ColorJitter(
117  float* clip,
118  const int length,
119  const int crop_height,
120  const int crop_width,
121  const float saturation,
122  const float brightness,
123  const float contrast,
124  std::mt19937* randgen) {
125  std::srand(unsigned(std::time(0)));
126  std::vector<int> jitter_order{0, 1, 2};
127  // obtain a time-based seed:
128  unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
129  std::shuffle(
130  jitter_order.begin(),
131  jitter_order.end(),
132  std::default_random_engine(seed));
133 
134  for (int i = 0; i < 3; ++i) {
135  if (jitter_order[i] == 0) {
136  Saturation(clip, length, crop_height, crop_width, saturation, randgen);
137  } else if (jitter_order[i] == 1) {
138  Brightness(clip, length, crop_height, crop_width, brightness, randgen);
139  } else {
140  Contrast(clip, length, crop_height, crop_width, contrast, randgen);
141  }
142  }
143 }
144 
// assume CLHW order and color channels RGB
//
// PCA-based color lighting augmentation (AlexNet-style), in place. Three
// Gaussian weights (stddev alpha_std) are drawn from randgen, combined
// with the provided RGB covariance eigenvectors/eigenvalues into a
// per-channel offset, and that offset is added to every pixel of the
// corresponding channel.
void ColorLighting(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const float alpha_std,
    const std::vector<std::vector<float>>& eigvecs,
    const std::vector<float>& eigvals,
    std::mt19937* randgen) {
  // One Gaussian weight per eigenvector.
  std::normal_distribution<float> gauss(0, alpha_std);
  std::vector<float> alphas(3);
  for (auto& a : alphas) {
    a = gauss(*randgen);
  }

  // Project the weighted eigenvalues back into RGB space.
  std::vector<float> delta_rgb(3, 0.0);
  for (int i = 0; i < 3; ++i) {
    for (int j = 0; j < 3; ++j) {
      delta_rgb[i] += eigvecs[i][j] * eigvals[j] * alphas[j];
    }
  }

  // Add the per-channel offset across each full L x H x W plane.
  const int channel_size = length * crop_height * crop_width;
  for (int c = 0; c < 3; ++c) {
    float* plane = clip + c * channel_size;
    for (int i = 0; i < channel_size; ++i) {
      plane[i] += delta_rgb[c];
    }
  }
}
179 
// assume CLHW order and color channels RGB
// mean subtraction and scaling.
//
// Normalizes the clip in place: for each channel c, every pixel becomes
// (pixel - mean[c]) * inv_std[c]. mean and inv_std must each hold at
// least `channels` entries.
void ColorNormalization(
    float* clip,
    const int length,
    const int crop_height,
    const int crop_width,
    const int channels,
    const std::vector<float>& mean,
    const std::vector<float>& inv_std) {
  const int channel_size = length * crop_height * crop_width;
  float* px = clip;
  for (int c = 0; c < channels; ++c) {
    // Hoist the per-channel constants out of the inner pixel walk.
    const float channel_mean = mean[c];
    const float channel_scale = inv_std[c];
    for (int i = 0; i < channel_size; ++i, ++px) {
      *px = (*px - channel_mean) * channel_scale;
    }
  }
}
202 
// Copies one or more crops of a decoded RGB buffer into transformed_clip.
// Input frames in buffer_rgb are interleaved HWC uint8; output is planar
// C, L, H, W float.
//
// multi_crop_count == 1: a single crop at (h_off, w_off), optionally
//   mirrored, with optional color jitter / lighting augmentation
//   (3-channel input only) followed by mean/inv_std normalization.
// multi_crop_count > 1: multi_crop_count crops written back to back into
//   transformed_clip using the per-crop offsets in multi_crop_h_off /
//   multi_crop_w_off; no augmentation (test-time path).
//
// transformed_clip must hold multi_crop_count * channels_rgb * length_rgb
// * crop_height * crop_width floats.
void ClipTransformRGB(
    const unsigned char* buffer_rgb,
    const int multi_crop_count,
    const int crop_height,
    const int crop_width,
    const int length_rgb,
    const int channels_rgb,
    const int sampling_rate_rgb,
    const int height,
    const int width,
    const int h_off,
    const int w_off,
    const int* multi_crop_h_off,
    const int* multi_crop_w_off,
    const bool mirror_me,
    const bool color_jitter,
    const float saturation,
    const float brightness,
    const float contrast,
    const bool color_lighting,
    const float color_lighting_std,
    const std::vector<std::vector<float>>& color_lighting_eigvecs,
    const std::vector<float>& color_lighting_eigvals,
    const std::vector<float>& mean_rgb,
    const std::vector<float>& inv_std_rgb,
    std::mt19937* randgen,
    float* transformed_clip) {
  CAFFE_ENFORCE_EQ(
      channels_rgb, mean_rgb.size(), "rgb channels must be equal to mean size");
  CAFFE_ENFORCE_EQ(
      mean_rgb.size(),
      inv_std_rgb.size(),
      "mean size must be equal to inv_std size");
  int orig_index, tran_index;
  if (multi_crop_count == 1) {
    // Case 1: Multi_cropping is disabled
    // The order of output dimensions is C, L, H, W
    // When jitter/lighting is enabled, normalization is deferred until
    // after the augmentation ops, which expect raw pixel values.
    bool do_color_jitter_lighting =
        (color_jitter || color_lighting) && channels_rgb == 3;
    for (int c = 0; c < channels_rgb; ++c) {
      for (int l = 0; l < length_rgb; ++l) {
        // Start of the l-th sampled frame in the interleaved HWC input;
        // frames are sampling_rate_rgb apart in the decoded sequence.
        int orig_index_l =
            l * sampling_rate_rgb * height * width * channels_rgb;
        int tran_index_l = (c * length_rgb + l) * crop_height;

        for (int h = 0; h < crop_height; ++h) {
          int orig_index_h = orig_index_l + (h + h_off) * width * channels_rgb;
          int tran_index_h = (tran_index_l + h) * crop_width;

          for (int w = 0; w < crop_width; ++w) {
            orig_index = orig_index_h + (w + w_off) * channels_rgb + c;

            // mirror the frame (horizontal flip of the crop)
            if (mirror_me) {
              tran_index = tran_index_h + (crop_width - 1 - w);
            } else {
              tran_index = tran_index_h + w;
            }

            // normalize and transform the clip
            if (do_color_jitter_lighting) {
              // raw copy; normalization happens after augmentation below
              transformed_clip[tran_index] = buffer_rgb[orig_index];
            } else {
              transformed_clip[tran_index] =
                  (buffer_rgb[orig_index] - mean_rgb[c]) * inv_std_rgb[c];
            }
          }
        }
      }
    }
    if (color_jitter && channels_rgb == 3) {
      ColorJitter(
          transformed_clip,
          length_rgb,
          crop_height,
          crop_width,
          saturation,
          brightness,
          contrast,
          randgen);
    }
    if (color_lighting && channels_rgb == 3) {
      ColorLighting(
          transformed_clip,
          length_rgb,
          crop_height,
          crop_width,
          color_lighting_std,
          color_lighting_eigvecs,
          color_lighting_eigvals,
          randgen);
    }
    if (do_color_jitter_lighting) {
      // Color normalization
      // Mean subtraction and division by standard deviation.
      ColorNormalization(
          transformed_clip,
          length_rgb,
          crop_height,
          crop_width,
          channels_rgb,
          mean_rgb,
          inv_std_rgb);
    }
  } else {
    // Case 2: Multi_cropping is enabled. Multi cropping should be only used at
    // testing stage. So color jittering and lighting are not used
    // Output layout: the first multi_crop_count/2 clips are un-mirrored
    // crops, followed by the same crops mirrored.
    for (int multi_crop_mirror = 0; multi_crop_mirror < 2;
         ++multi_crop_mirror) {
      for (int i = 0; i < multi_crop_count / 2; ++i) {
        for (int c = 0; c < channels_rgb; ++c) {
          for (int l = 0; l < length_rgb; ++l) {
            int orig_index_l =
                l * sampling_rate_rgb * height * width * channels_rgb;
            int tran_index_l = (c * length_rgb + l) * crop_height;

            for (int h = 0; h < crop_height; ++h) {
              int orig_index_h = orig_index_l +
                  (h + multi_crop_h_off[i]) * width * channels_rgb;
              int tran_index_h = (tran_index_l + h) * crop_width;

              for (int w = 0; w < crop_width; ++w) {
                orig_index =
                    orig_index_h + (w + multi_crop_w_off[i]) * channels_rgb + c;

                if (multi_crop_mirror == 1) {
                  tran_index = tran_index_h + (crop_width - 1 - w);
                } else {
                  tran_index = tran_index_h + w;
                }

                transformed_clip[tran_index] =
                    (buffer_rgb[orig_index] - mean_rgb[c]) * inv_std_rgb[c];
              }
            }
          }
        }
        // advance to the next clip's slot in the output buffer
        transformed_clip +=
            channels_rgb * length_rgb * crop_height * crop_width;
      }
    }
  }
}
346 
// Computes optical flow channels for a decoded RGB buffer and writes them
// into transformed_clip (planar: channel, then frame, then H x W floats).
// For each of the length_of output frames, grayscale versions of the
// frames spanning frame_gap_of are fed to MultiFrameOpticalFlowExtractor;
// the resulting 2-channel flow is normalized with mean_of/inv_std_of and
// stored. Depending on flow_data_type, a third channel (flow magnitude or
// grayscale image) or three RGB channels are appended.
//
// NOTE(review): mean_static/std_static/count are function-local statics
// shared across calls (and threads) and are only touched when
// extract_statistics is flipped to true by hand — debug-only machinery;
// not thread-safe if enabled. Confirm before relying on it.
void ClipTransformOpticalFlow(
    const unsigned char* buffer_rgb,
    const int crop_height,
    const int crop_width,
    const int length_of,
    const int channels_of,
    const int sampling_rate_of,
    const int height,
    const int width,
    const cv::Rect& rect,
    const int channels_rgb,
    const bool mirror_me,
    const int flow_alg_type,
    const int flow_data_type,
    const int frame_gap_of,
    const bool do_flow_aggregation,
    const std::vector<float>& mean_of,
    const std::vector<float>& inv_std_of,
    float* transformed_clip) {
  const int frame_size = crop_height * crop_width;
  const int channel_size_flow = length_of * frame_size;

  // for get the mean and std of the input data (debug-only; see NOTE above)
  bool extract_statistics = false;
  static std::vector<double> mean_static(channels_of, 0.f);
  static std::vector<double> std_static(channels_of, 0.f);
  static long long count = 0;
  cv::Scalar mean_img, std_img;

  for (int l = 0; l < length_of; l++) {
    // get the grayscale frames
    // With aggregation every intermediate frame is used; otherwise only
    // the two endpoint frames frame_gap_of apart.
    std::vector<cv::Mat> grays, rgbs;
    int step_size = do_flow_aggregation ? 1 : frame_gap_of;
    for (int j = 0; j <= frame_gap_of; j += step_size) {
      // get the current frame
      const unsigned char* curr_frame = buffer_rgb +
          (l * sampling_rate_of + j) * height * width * channels_rgb;
      cv::Mat img = cv::Mat::zeros(height, width, CV_8UC3);
      memcpy(
          img.data,
          curr_frame,
          height * width * channels_rgb * sizeof(unsigned char));

      // crop and mirror the frame
      cv::Mat img_cropped = img(rect);
      if (mirror_me) {
        cv::flip(img_cropped, img_cropped, 1);
      }

      cv::Mat gray;
      cv::cvtColor(img_cropped, gray, cv::COLOR_RGB2GRAY);
      grays.push_back(gray);
      rgbs.push_back(img_cropped);
    }

    cv::Mat first_gray, first_rgb;
    cv::Mat flow = cv::Mat::zeros(crop_height, crop_width, CV_32FC2);
    MultiFrameOpticalFlowExtractor(grays, flow_alg_type, flow);

    std::vector<cv::Mat> imgs;
    cv::split(flow, imgs);
    // save the 2-channel optical flow first
    int c = 0;
    for (; c < 2; c++) {
      if (extract_statistics) {
        cv::meanStdDev(imgs[c], mean_img, std_img);
        mean_static[c] += mean_img[0];
        std_static[c] += std_img[0];
      }

      // normalize in place, then copy the plane into the output buffer
      imgs[c] -= mean_of[c];
      imgs[c] *= inv_std_of[c];
      memcpy(
          transformed_clip + c * channel_size_flow + l * frame_size,
          imgs[c].data,
          frame_size * sizeof(float));
    }

    cv::Mat mag;
    std::vector<cv::Mat> chans;
    // augment the optical flow with more channels (c == 2 here)
    switch (flow_data_type) {
      case FlowDataType::Flow2C:
        // nothing to do if we only need two channels
        break;

      case FlowDataType::Flow3C:
        // use magnitude as the third channel (L1 norm of the flow vectors)
        mag = cv::abs(imgs[0]) + cv::abs(imgs[1]);
        if (extract_statistics) {
          cv::meanStdDev(mag, mean_img, std_img);
          mean_static[c] += mean_img[0];
          std_static[c] += std_img[0];
        }

        mag -= mean_of[c];
        mag *= inv_std_of[c];
        memcpy(
            transformed_clip + c * channel_size_flow + l * frame_size,
            mag.data,
            frame_size * sizeof(float));
        break;

      case FlowDataType::FlowWithGray:
        // add grayscale image as the third channel
        grays[0].convertTo(first_gray, CV_32FC1);
        if (extract_statistics) {
          cv::meanStdDev(first_gray, mean_img, std_img);
          mean_static[c] += mean_img[0];
          std_static[c] += std_img[0];
        }

        first_gray -= mean_of[c];
        first_gray *= inv_std_of[c];
        memcpy(
            transformed_clip + c * channel_size_flow + l * frame_size,
            first_gray.data,
            frame_size * sizeof(float));
        break;

      case FlowDataType::FlowWithRGB:
        // add all three rgb channels (channels 2..channels_of-1)
        rgbs[0].convertTo(first_rgb, CV_32FC3);
        cv::split(first_rgb, chans);
        for (; c < channels_of; c++) {
          if (extract_statistics) {
            cv::meanStdDev(chans[c - 2], mean_img, std_img);
            mean_static[c] += mean_img[0];
            std_static[c] += std_img[0];
          }

          chans[c - 2] -= mean_of[c];
          chans[c - 2] *= inv_std_of[c];
          memcpy(
              transformed_clip + c * channel_size_flow + l * frame_size,
              chans[c - 2].data,
              frame_size * sizeof(float));
        }
        break;

      default:
        LOG(ERROR) << "Unsupported optical flow data type " << flow_data_type;
        break;
    }

    if (extract_statistics) {
      count++;
      // periodically dump the running per-channel mean/std estimates
      if (count % 1000 == 1) {
        for (int i = 0; i < channels_of; i++) {
          LOG(INFO) << i
                    << "-th channel mean: " << mean_static[i] / float(count)
                    << " std: " << std_static[i] / float(count);
        }
      }
    }
  }
}
504 
505 void FreeDecodedData(
506  std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
507  // free the sampledFrames
508  for (int i = 0; i < sampledFrames.size(); i++) {
509  DecodedFrame* p = sampledFrames[i].release();
510  delete p;
511  }
512  sampledFrames.clear();
513 }
514 
// Decodes a video — from an in-memory buffer (use_local_file == false) or
// from video_filename — and extracts clip_per_video clips, each of
// params.num_of_required_frame_ RGB frames. On success, height/width are
// set from the first decoded frame and buffer_rgb receives one
// newly-allocated uint8 buffer per clip (3 * height * width bytes per
// frame); any buffers previously in buffer_rgb are delete[]d first, and
// the caller owns the new ones.
//
// NOTE(review): this returns true even when too few frames were decoded
// (it logs an error and leaves buffer_rgb empty) — callers apparently
// detect failure via buffer_rgb.size(); confirm before changing the
// return value.
bool DecodeMultipleClipsFromVideo(
    const char* video_buffer,
    const std::string& video_filename,
    const int encoded_size,
    const Params& params,
    const int start_frm,
    const int clip_per_video,
    const bool use_local_file,
    int& height,
    int& width,
    std::vector<unsigned char*>& buffer_rgb) {
  std::vector<std::unique_ptr<DecodedFrame>> sampledFrames;
  VideoDecoder decoder;

  // decoding from buffer or file
  if (!use_local_file) {
    decoder.decodeMemory(
        video_buffer, encoded_size, params, start_frm, sampledFrames);
  } else {
    decoder.decodeFile(video_filename, params, start_frm, sampledFrames);
  }

  // free any clips left over from a previous call before refilling
  for (int i = 0; i < buffer_rgb.size(); i++) {
    unsigned char* buff = buffer_rgb[i];
    delete[] buff;
  }
  buffer_rgb.clear();

  if (sampledFrames.size() < params.num_of_required_frame_) {
    LOG(ERROR)
        << "The video seems faulty and we could not decode enough frames: "
        << sampledFrames.size() << " VS " << params.num_of_required_frame_;
    FreeDecodedData(sampledFrames);
    return true;
  }

  height = sampledFrames[0]->height_;
  width = sampledFrames[0]->width_;
  // Spread clip start positions evenly over the decoded sequence so the
  // last clip's final frame lands on the last decoded frame.
  float sample_stepsz = 1.0;
  if (clip_per_video > 1) {
    sample_stepsz =
        float(sampledFrames.size() - params.num_of_required_frame_) /
        (clip_per_video - 1.0);
  }
  int image_size = 3 * height * width;
  int clip_size = params.num_of_required_frame_ * image_size;
  // get the RGB frames for each clip
  for (int i = 0; i < clip_per_video; i++) {
    unsigned char* buffer_rgb_ptr = new unsigned char[clip_size];
    int clip_start = floor(i * sample_stepsz);
    for (int j = 0; j < params.num_of_required_frame_; j++) {
      memcpy(
          buffer_rgb_ptr + j * image_size,
          (unsigned char*)sampledFrames[j + clip_start]->data_.get(),
          image_size * sizeof(unsigned char));
    }
    buffer_rgb.push_back(buffer_rgb_ptr);
  }
  FreeDecodedData(sampledFrames);

  return true;
}
577 
578 } // namespace caffe2
// Copyright (c) 2016-present, Facebook, Inc.