Caffe2 - C++ API
A deep learning, cross-platform ML framework
video_decoder.cc
1 #include <caffe2/video/video_decoder.h>
2 #include <caffe2/core/logging.h>
3 
4 #include <stdio.h>
5 #include <mutex>
6 #include <random>
7 
8 extern "C" {
9 #include <libavcodec/avcodec.h>
10 #include <libavformat/avformat.h>
11 #include <libavutil/log.h>
12 #include <libswresample/swresample.h>
13 #include <libswscale/swscale.h>
14 }
15 
16 namespace caffe2 {
17 
18 VideoDecoder::VideoDecoder() {
19  static bool gInitialized = false;
20  static std::mutex gMutex;
21  std::unique_lock<std::mutex> lock(gMutex);
22  if (!gInitialized) {
23  av_register_all();
24  avcodec_register_all();
25  avformat_network_init();
26  gInitialized = true;
27  }
28 }
29 
30 void VideoDecoder::ResizeAndKeepAspectRatio(
31  const int origHeight,
32  const int origWidth,
33  const int heightMin,
34  const int widthMin,
35  int& outHeight,
36  int& outWidth) {
37  float min_aspect = (float)heightMin / (float)widthMin;
38  float video_aspect = (float)origHeight / (float)origWidth;
39  if (video_aspect >= min_aspect) {
40  outWidth = widthMin;
41  outHeight = (int)ceil(video_aspect * outWidth);
42  } else {
43  outHeight = heightMin;
44  outWidth = (int)ceil(outHeight / video_aspect);
45  }
46 }
47 
// Decodes a video stream supplied through the custom AVIO context `ioctx`
// and appends the decoded, rescaled frames to `sampledFrames`.
//
//   videoName     - name used only in log messages (a file path or the
//                   literal "Memory Buffer").
//   ioctx         - IO context wrapping the underlying file/memory buffer.
//   params        - decoding parameters: target pixel format/resolution,
//                   sampling intervals, decode type, keyframe flag, etc.
//   start_frm     - starting frame index; only used when
//                   params.decode_type_ == DecodeType::USE_START_FRM.
//   sampledFrames - output vector; cleared first, then filled with frames
//                   that survive the sampling logic below.
//
// NOTE(review): the early `return`s below (probe failure, open/stream/codec
// errors, empty intervals) skip the cleanup block at the end of the
// function, so `inputContext` (and later the codec context and frame)
// appear to be leaked on those paths — TODO confirm and unify the cleanup.
void VideoDecoder::decodeLoop(
    const string& videoName,
    VideoIOContext& ioctx,
    const Params& params,
    const int start_frm,
    std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
  AVPixelFormat pixFormat = params.pixelFormat_;
  AVFormatContext* inputContext = avformat_alloc_context();
  AVStream* videoStream_ = nullptr;
  AVCodecContext* videoCodecContext_ = nullptr;
  AVFrame* videoStreamFrame_ = nullptr;
  AVPacket packet;
  av_init_packet(&packet); // init packet
  SwsContext* scaleContext_ = nullptr;

  try {
    // Route all reads through the caller-provided IO context instead of a
    // file path.
    inputContext->pb = ioctx.get_avio();
    inputContext->flags |= AVFMT_FLAG_CUSTOM_IO;
    int ret = 0;

    // Determining the input format:
    // read a 1 KiB probe buffer (plus the padding ffmpeg requires) from the
    // start of the stream and let ffmpeg guess the container format.
    int probeSz = 1 * 1024 + AVPROBE_PADDING_SIZE;
    DecodedFrame::AvDataPtr probe((uint8_t*)av_malloc(probeSz));

    memset(probe.get(), 0, probeSz);
    int len = ioctx.read(probe.get(), probeSz - AVPROBE_PADDING_SIZE);
    if (len < probeSz - AVPROBE_PADDING_SIZE) {
      LOG(ERROR) << "Insufficient data to determine video format";
      return;
    }

    // seek back to start of stream
    ioctx.seek(0, SEEK_SET);

    // av_probe_input_format only inspects the buffer; `probe` keeps
    // ownership of the bytes.
    unique_ptr<AVProbeData> probeData(new AVProbeData());
    probeData->buf = probe.get();
    probeData->buf_size = len;
    probeData->filename = "";
    // Determine the input-format:
    inputContext->iformat = av_probe_input_format(probeData.get(), 1);

    ret = avformat_open_input(&inputContext, "", nullptr, nullptr);
    if (ret < 0) {
      LOG(ERROR) << "Unable to open stream " << ffmpegErrorStr(ret);
      return;
    }

    ret = avformat_find_stream_info(inputContext, nullptr);
    if (ret < 0) {
      LOG(ERROR) << "Unable to find stream info in " << videoName << " "
                 << ffmpegErrorStr(ret);
      return;
    }

    // Decode the first video stream
    // NOTE(review): when params.streamIndex_ is >= 0 the loop below is
    // skipped and videoStream_ is never assigned, so an explicit stream
    // index appears to always hit the "Unable to find video stream" error
    // — verify whether the stream should be looked up by that index.
    int videoStreamIndex_ = params.streamIndex_;
    if (videoStreamIndex_ == -1) {
      for (int i = 0; i < inputContext->nb_streams; i++) {
        auto stream = inputContext->streams[i];
        if (stream->codec->codec_type == AVMEDIA_TYPE_VIDEO) {
          videoStreamIndex_ = i;
          videoStream_ = stream;
          break;
        }
      }
    }

    if (videoStream_ == nullptr) {
      LOG(ERROR) << "Unable to find video stream in " << videoName << " "
                 << ffmpegErrorStr(ret);
      return;
    }

    // Initialize codec
    AVDictionary* opts = nullptr;
    videoCodecContext_ = videoStream_->codec;
    try {
      ret = avcodec_open2(
          videoCodecContext_,
          avcodec_find_decoder(videoCodecContext_->codec_id),
          &opts);
    } catch (const std::exception&) {
      LOG(ERROR) << "Exception during open video codec";
      return;
    }

    if (ret < 0) {
      LOG(ERROR) << "Cannot open video codec : "
                 << videoCodecContext_->codec->name;
      return;
    }

    // Calculate if we need to rescale the frames
    int origWidth = videoCodecContext_->width;
    int origHeight = videoCodecContext_->height;
    int outWidth = origWidth;
    int outHeight = origHeight;

    if (params.video_res_type_ == VideoResType::ORIGINAL_RES) {
      // if the original resolution is too low,
      // make its size at least (crop_height, crop_width)
      if (params.crop_width_ > origWidth || params.crop_height_ > origHeight) {
        ResizeAndKeepAspectRatio(
            origHeight,
            origWidth,
            params.crop_height_,
            params.crop_width_,
            outHeight,
            outWidth);
      }
    } else if (
        params.video_res_type_ == VideoResType::USE_MINIMAL_WIDTH_HEIGHT) {
      // resize the image to be at least
      // (height_min, width_min) resolution while keep the aspect ratio
      ResizeAndKeepAspectRatio(
          origHeight,
          origWidth,
          params.height_min_,
          params.width_min_,
          outHeight,
          outWidth);
    } else if (params.video_res_type_ == VideoResType::USE_WIDTH_HEIGHT) {
      // resize the image to the predefined
      // resolution and ignore the aspect ratio
      outWidth = params.scale_w_;
      outHeight = params.scale_h_;
    } else {
      LOG(ERROR) << "Unknown video_res_type: " << params.video_res_type_;
    }

    // Make sure that we have a valid format
    CAFFE_ENFORCE_NE(videoCodecContext_->pix_fmt, AV_PIX_FMT_NONE);

    // Create a scale context that converts decoded frames from the codec's
    // native size/pixel format to (outWidth, outHeight) in `pixFormat`.
    scaleContext_ = sws_getContext(
        videoCodecContext_->width,
        videoCodecContext_->height,
        videoCodecContext_->pix_fmt,
        outWidth,
        outHeight,
        pixFormat,
        SWS_FAST_BILINEAR,
        nullptr,
        nullptr,
        nullptr);

    // Getting video meta data
    VideoMeta videoMeta;
    videoMeta.codec_type = videoCodecContext_->codec_type;
    videoMeta.width = outWidth;
    videoMeta.height = outHeight;
    videoMeta.pixFormat = pixFormat;
    videoMeta.fps = av_q2d(videoStream_->avg_frame_rate);

    // If sampledFrames is not empty, empty it
    if (sampledFrames.size() > 0) {
      sampledFrames.clear();
    }

    if (params.intervals_.size() == 0) {
      LOG(ERROR) << "Empty sampling intervals.";
      return;
    }

    // Validate the first sampling interval: it must start at timestamp 0
    // and carry a legal fps value (> 0 or one of the SpecialFps codes).
    std::vector<SampleInterval>::const_iterator itvlIter =
        params.intervals_.begin();
    if (itvlIter->timestamp != 0) {
      LOG(ERROR) << "Sampling interval starting timestamp is not zero.";
    }

    double currFps = itvlIter->fps;
    if (currFps < 0 && currFps != SpecialFps::SAMPLE_ALL_FRAMES &&
        currFps != SpecialFps::SAMPLE_TIMESTAMP_ONLY) {
      // fps must be 0, -1, -2 or > 0
      LOG(ERROR) << "Invalid sampling fps.";
    }

    double prevTimestamp = itvlIter->timestamp;
    itvlIter++;
    if (itvlIter != params.intervals_.end() &&
        prevTimestamp >= itvlIter->timestamp) {
      LOG(ERROR) << "Sampling interval timestamps must be strictly ascending.";
    }

    // Timestamp of the last frame we emitted; -1 means "no frame sampled
    // yet in the current interval", which forces the next frame through.
    double lastFrameTimestamp = -1.0;
    // Initialize frame and packet.
    // These will be reused across calls.
    videoStreamFrame_ = av_frame_alloc();

    // frame index in video stream
    int frameIndex = -1;
    // frame index of outputed frames
    int outputFrameIndex = -1;

    /* identify the starting point from where we must start decoding */
    std::mt19937 meta_randgen(time(nullptr));
    long int start_ts = -1;
    bool mustDecodeAll = false;
    if (videoStream_->duration > 0 && videoStream_->nb_frames > 0) {
      /* we have a valid duration and nb_frames. We can safely
       * detect an intermediate timestamp to start decoding from. */

      // leave a margin of 10 frames to take in to account the error
      // from av_seek_frame
      long int margin =
          int(ceil((10 * videoStream_->duration) / (videoStream_->nb_frames)));
      // if we need to do temporal jittering
      if (params.decode_type_ == DecodeType::DO_TMP_JITTER) {
        /* estimate the average duration for the required # of frames */
        double maxFramesDuration =
            (videoStream_->duration * params.num_of_required_frame_) /
            (videoStream_->nb_frames);
        int ts1 = 0;
        int ts2 = videoStream_->duration - int(ceil(maxFramesDuration));
        ts2 = ts2 > 0 ? ts2 : 0;
        // pick a random timestamp between ts1 and ts2. ts2 is selected such
        // that you have enough frames to satisfy the required # of frames.
        start_ts = std::uniform_int_distribution<>(ts1, ts2)(meta_randgen);
        // seek a frame at start_ts
        ret = av_seek_frame(
            inputContext,
            videoStreamIndex_,
            0 > (start_ts - margin) ? 0 : (start_ts - margin),
            AVSEEK_FLAG_BACKWARD);

        // if we need to decode from the start_frm
      } else if (params.decode_type_ == DecodeType::USE_START_FRM) {
        // convert the frame index into a stream timestamp, assuming frames
        // are uniformly distributed over the stream duration
        start_ts = int(floor(
            (videoStream_->duration * start_frm) / (videoStream_->nb_frames)));
        // seek a frame at start_ts
        ret = av_seek_frame(
            inputContext,
            videoStreamIndex_,
            0 > (start_ts - margin) ? 0 : (start_ts - margin),
            AVSEEK_FLAG_BACKWARD);
      } else {
        mustDecodeAll = true;
      }

      if (ret < 0) {
        LOG(ERROR) << "Unable to decode from a random start point";
        /* fall back to default decoding of all frames from start */
        av_seek_frame(inputContext, videoStreamIndex_, 0, AVSEEK_FLAG_BACKWARD);
        mustDecodeAll = true;
      }
    } else {
      /* we do not have the necessary metadata to selectively decode frames.
       * Decode all frames as we do in the default case */
      LOG(INFO) << " Decoding all frames as we do not have suffiecient"
                   " metadata for selective decoding.";
      mustDecodeAll = true;
    }

    int gotPicture = 0;
    int eof = 0;
    int selectiveDecodedFrames = 0;

    // Cap on decoded frames in selective mode; uniform sampling decodes up
    // to MAX_DECODING_FRAMES, other modes stop at num_of_required_frame_.
    int maxFrames = (params.decode_type_ == DecodeType::DO_UNIFORM_SMP)
        ? MAX_DECODING_FRAMES
        : params.num_of_required_frame_;
    // There is a delay between reading packets from the
    // transport and getting decoded frames back.
    // Therefore, after EOF, continue going while
    // the decoder is still giving us frames.
    int ipacket = 0;
    while ((!eof || gotPicture) &&
           /* either you must decode all frames or decode upto maxFrames
            * based on status of the mustDecodeAll flag */
           (mustDecodeAll ||
            ((!mustDecodeAll) && (selectiveDecodedFrames < maxFrames))) &&
           /* If on the last interval and not autodecoding keyframes and a
            * SpecialFps indicates no more frames are needed, stop decoding */
           !((itvlIter == params.intervals_.end() &&
              (currFps == SpecialFps::SAMPLE_TIMESTAMP_ONLY ||
               currFps == SpecialFps::SAMPLE_NO_FRAME)) &&
             !params.keyFrames_)) {
      try {
        if (!eof) {
          ret = av_read_frame(inputContext, &packet);
          if (ret == AVERROR_EOF) {
            eof = 1;
            av_free_packet(&packet);
            packet.data = nullptr;
            packet.size = 0;
            // stay in the while loop to flush frames
          } else if (ret == AVERROR(EAGAIN)) {
            av_free_packet(&packet);
            continue;
          } else if (ret < 0) {
            LOG(ERROR) << "Error reading packet : " << ffmpegErrorStr(ret);
          }
          ipacket++;

          // Ignore packets from other streams
          if (packet.stream_index != videoStreamIndex_) {
            av_free_packet(&packet);
            continue;
          }
        }

        // Feed the packet (or the empty flush packet after EOF) to the
        // decoder; gotPicture signals whether a full frame came out.
        ret = avcodec_decode_video2(
            videoCodecContext_, videoStreamFrame_, &gotPicture, &packet);
        if (ret < 0) {
          LOG(ERROR) << "Error decoding video frame : " << ffmpegErrorStr(ret);
        }

        try {
          // Nothing to do without a picture
          if (!gotPicture) {
            av_free_packet(&packet);
            continue;
          }
          frameIndex++;

          long int frame_ts =
              av_frame_get_best_effort_timestamp(videoStreamFrame_);
          double timestamp = frame_ts * av_q2d(videoStream_->time_base);

          if ((frame_ts >= start_ts && !mustDecodeAll) || mustDecodeAll) {
            /* process current frame if:
             * 1) We are not doing selective decoding and mustDecodeAll
             *    OR
             * 2) We are doing selective decoding and current frame
             *   timestamp is >= start_ts from where we start selective
             *   decoding*/
            // if reaching the next interval, update the current fps
            // and reset lastFrameTimestamp so the current frame could be
            // sampled (unless fps == SpecialFps::SAMPLE_NO_FRAME)
            if (itvlIter != params.intervals_.end() &&
                timestamp >= itvlIter->timestamp) {
              lastFrameTimestamp = -1.0;
              currFps = itvlIter->fps;
              prevTimestamp = itvlIter->timestamp;
              itvlIter++;
              if (itvlIter != params.intervals_.end() &&
                  prevTimestamp >= itvlIter->timestamp) {
                LOG(ERROR)
                    << "Sampling interval timestamps must be strictly ascending.";
              }
            }

            // keyFrame will bypass all checks on fps sampling settings
            bool keyFrame = params.keyFrames_ && videoStreamFrame_->key_frame;
            if (!keyFrame) {
              // if fps == SpecialFps::SAMPLE_NO_FRAME (0), don't sample at all
              if (currFps == SpecialFps::SAMPLE_NO_FRAME) {
                av_free_packet(&packet);
                continue;
              }

              // fps is considered reached in the following cases:
              // 1. lastFrameTimestamp < 0 - start of a new interval
              //    (or first frame)
              // 2. currFps == SpecialFps::SAMPLE_ALL_FRAMES (-1) - sample every
              //    frame
              // 3. timestamp - lastFrameTimestamp has reached target fps and
              //    currFps > 0 (not special fps setting)
              // different modes for fps:
              // SpecialFps::SAMPLE_NO_FRAMES (0):
              //    disable fps sampling, no frame sampled at all
              // SpecialFps::SAMPLE_ALL_FRAMES (-1):
              //    unlimited fps sampling, will sample at native video fps
              // SpecialFps::SAMPLE_TIMESTAMP_ONLY (-2):
              //    disable fps sampling, but will get the frame at specific
              //    timestamp
              // others (> 0): decoding at the specified fps
              bool fpsReached = lastFrameTimestamp < 0 ||
                  currFps == SpecialFps::SAMPLE_ALL_FRAMES ||
                  (currFps > 0 &&
                   timestamp >= lastFrameTimestamp + (1 / currFps));

              if (!fpsReached) {
                av_free_packet(&packet);
                continue;
              }
            }

            lastFrameTimestamp = timestamp;

            outputFrameIndex++;
            if (params.maximumOutputFrames_ != -1 &&
                outputFrameIndex >= params.maximumOutputFrames_) {
              // enough frames
              av_free_packet(&packet);
              break;
            }

            AVFrame* rgbFrame = av_frame_alloc();
            if (!rgbFrame) {
              LOG(ERROR) << "Error allocating AVframe";
            }

            try {
              // Determine required buffer size and allocate buffer
              int numBytes = avpicture_get_size(pixFormat, outWidth, outHeight);
              DecodedFrame::AvDataPtr buffer(
                  (uint8_t*)av_malloc(numBytes * sizeof(uint8_t)));

              // Point rgbFrame's data planes into `buffer`; ownership of
              // the pixels stays with `buffer` (moved into the frame below).
              int size = avpicture_fill(
                  (AVPicture*)rgbFrame,
                  buffer.get(),
                  pixFormat,
                  outWidth,
                  outHeight);

              // Rescale/convert the decoded frame into the output buffer.
              sws_scale(
                  scaleContext_,
                  videoStreamFrame_->data,
                  videoStreamFrame_->linesize,
                  0,
                  videoCodecContext_->height,
                  rgbFrame->data,
                  rgbFrame->linesize);

              unique_ptr<DecodedFrame> frame = make_unique<DecodedFrame>();
              frame->width_ = outWidth;
              frame->height_ = outHeight;
              frame->data_ = move(buffer);
              frame->size_ = size;
              frame->index_ = frameIndex;
              frame->outputFrameIndex_ = outputFrameIndex;
              frame->timestamp_ = timestamp;
              frame->keyFrame_ = videoStreamFrame_->key_frame;

              sampledFrames.push_back(move(frame));
              selectiveDecodedFrames++;
              av_frame_free(&rgbFrame);
            } catch (const std::exception&) {
              av_frame_free(&rgbFrame);
            }
          }
          av_frame_unref(videoStreamFrame_);
        } catch (const std::exception&) {
          av_frame_unref(videoStreamFrame_);
        }

        av_free_packet(&packet);
      } catch (const std::exception&) {
        av_free_packet(&packet);
      }
    } // of while loop

    // free all stuffs
    // (avformat_close_input also frees the context and nulls the pointer,
    // so the trailing avformat_free_context call is a no-op on nullptr)
    sws_freeContext(scaleContext_);
    av_packet_unref(&packet);
    av_frame_free(&videoStreamFrame_);
    avcodec_close(videoCodecContext_);
    avformat_close_input(&inputContext);
    avformat_free_context(inputContext);
  } catch (const std::exception&) {
    // In case of decoding error
    // free all stuffs
    sws_freeContext(scaleContext_);
    av_packet_unref(&packet);
    av_frame_free(&videoStreamFrame_);
    avcodec_close(videoCodecContext_);
    avformat_close_input(&inputContext);
    avformat_free_context(inputContext);
  }
}
508 
509 void VideoDecoder::decodeMemory(
510  const char* buffer,
511  const int size,
512  const Params& params,
513  const int start_frm,
514  std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
515  VideoIOContext ioctx(buffer, size);
516  decodeLoop(string("Memory Buffer"), ioctx, params, start_frm, sampledFrames);
517 }
518 
519 void VideoDecoder::decodeFile(
520  const string& file,
521  const Params& params,
522  const int start_frm,
523  std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
524  VideoIOContext ioctx(file);
525  decodeLoop(file, ioctx, params, start_frm, sampledFrames);
526 }
527 
528 string VideoDecoder::ffmpegErrorStr(int result) {
529  std::array<char, 128> buf;
530  av_strerror(result, buf.data(), buf.size());
531  return string(buf.data());
532 }
533 
534 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13