Caffe2 - C++ API
A deep learning, cross-platform ML framework
video_decoder.cc
#include <caffe2/video/video_decoder.h>
#include <caffe2/core/logging.h>

#include <math.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#include <algorithm>
#include <array>
#include <mutex>
#include <random>

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/log.h>
#include <libswresample/swresample.h>
#include <libswscale/swscale.h>
}

namespace caffe2 {

VideoDecoder::VideoDecoder() {
  static bool gInitialized = false;
  static std::mutex gMutex;
  std::unique_lock<std::mutex> lock(gMutex);
  if (!gInitialized) {
    av_register_all();
    avcodec_register_all();
    avformat_network_init();
    gInitialized = true;
  }
}

void VideoDecoder::ResizeAndKeepAspectRatio(
    const int origHeight,
    const int origWidth,
    const int heightMin,
    const int widthMin,
    int& outHeight,
    int& outWidth) {
  float min_aspect = (float)heightMin / (float)widthMin;
  float video_aspect = (float)origHeight / (float)origWidth;
  if (video_aspect >= min_aspect) {
    outWidth = widthMin;
    outHeight = (int)ceil(video_aspect * outWidth);
  } else {
    outHeight = heightMin;
    outWidth = (int)ceil(outHeight / video_aspect);
  }
}

void VideoDecoder::decodeLoop(
    const string& videoName,
    VideoIOContext& ioctx,
    const Params& params,
    const int start_frm,
    std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
  AVPixelFormat pixFormat = params.pixelFormat_;
  AVFormatContext* inputContext = avformat_alloc_context();
  AVStream* videoStream_ = nullptr;
  AVCodecContext* videoCodecContext_ = nullptr;
  AVFrame* videoStreamFrame_ = nullptr;
  AVPacket packet;
  av_init_packet(&packet); // init packet
  SwsContext* scaleContext_ = nullptr;

  try {
    inputContext->pb = ioctx.get_avio();
    inputContext->flags |= AVFMT_FLAG_CUSTOM_IO;
    int ret = 0;

    // Determine the input format by probing the beginning of the stream
    int probeSz = 32 * 1024 + AVPROBE_PADDING_SIZE;
    DecodedFrame::AvDataPtr probe((uint8_t*)av_malloc(probeSz));

    memset(probe.get(), 0, probeSz);
    int len = ioctx.read(probe.get(), probeSz - AVPROBE_PADDING_SIZE);
    if (len < probeSz - AVPROBE_PADDING_SIZE) {
      LOG(ERROR) << "Insufficient data to determine video format";
      return;
    }

    // seek back to the start of the stream
    ioctx.seek(0, SEEK_SET);

    unique_ptr<AVProbeData> probeData(new AVProbeData());
    probeData->buf = probe.get();
    probeData->buf_size = len;
    probeData->filename = "";
    // Determine the input format
    inputContext->iformat = av_probe_input_format(probeData.get(), 1);

    ret = avformat_open_input(&inputContext, "", nullptr, nullptr);
    if (ret < 0) {
      LOG(ERROR) << "Unable to open stream " << ffmpegErrorStr(ret);
      return;
    }

    ret = avformat_find_stream_info(inputContext, nullptr);
    if (ret < 0) {
      LOG(ERROR) << "Unable to find stream info in " << videoName << " "
                 << ffmpegErrorStr(ret);
      return;
    }

    // Decode the first video stream
    int videoStreamIndex_ = params.streamIndex_;
    if (videoStreamIndex_ == -1) {
      for (int i = 0; i < inputContext->nb_streams; i++) {
        auto stream = inputContext->streams[i];
        if (stream->codec->codec_type == AVMEDIA_TYPE_VIDEO) {
          videoStreamIndex_ = i;
          videoStream_ = stream;
          break;
        }
      }
    }

    if (videoStream_ == nullptr) {
      LOG(ERROR) << "Unable to find video stream in " << videoName << " "
                 << ffmpegErrorStr(ret);
      return;
    }

    // Initialize codec
    AVDictionary* opts = nullptr;
    videoCodecContext_ = videoStream_->codec;
    try {
      ret = avcodec_open2(
          videoCodecContext_,
          avcodec_find_decoder(videoCodecContext_->codec_id),
          &opts);
    } catch (const std::exception&) {
      LOG(ERROR) << "Exception while opening the video codec";
      return;
    }

    if (ret < 0) {
      LOG(ERROR) << "Cannot open video codec : "
                 << videoCodecContext_->codec->name;
      return;
    }

    // Calculate whether we need to rescale the frames
    int origWidth = videoCodecContext_->width;
    int origHeight = videoCodecContext_->height;
    int outWidth = origWidth;
    int outHeight = origHeight;

    if (params.video_res_type_ == VideoResType::ORIGINAL_RES) {
      // if the original resolution is too low,
      // make its size at least (crop_height, crop_width)
      if (params.crop_width_ > origWidth || params.crop_height_ > origHeight) {
        ResizeAndKeepAspectRatio(
            origHeight,
            origWidth,
            params.crop_height_,
            params.crop_width_,
            outHeight,
            outWidth);
      }
    } else if (
        params.video_res_type_ == VideoResType::USE_MINIMAL_WIDTH_HEIGHT) {
      // resize the image to be at least
      // (height_min, width_min) while keeping the aspect ratio
      ResizeAndKeepAspectRatio(
          origHeight,
          origWidth,
          params.height_min_,
          params.width_min_,
          outHeight,
          outWidth);
    } else if (params.video_res_type_ == VideoResType::USE_WIDTH_HEIGHT) {
      // resize the image to the predefined
      // resolution and ignore the aspect ratio
      outWidth = params.scale_w_;
      outHeight = params.scale_h_;
    } else {
      LOG(ERROR) << "Unknown video_res_type: " << params.video_res_type_;
    }

    // Make sure that we have a valid pixel format
    CAFFE_ENFORCE_NE(videoCodecContext_->pix_fmt, AV_PIX_FMT_NONE);

    // Create a scale context
    scaleContext_ = sws_getContext(
        videoCodecContext_->width,
        videoCodecContext_->height,
        videoCodecContext_->pix_fmt,
        outWidth,
        outHeight,
        pixFormat,
        SWS_FAST_BILINEAR,
        nullptr,
        nullptr,
        nullptr);

    // Getting video meta data
    VideoMeta videoMeta;
    videoMeta.codec_type = videoCodecContext_->codec_type;
    videoMeta.width = outWidth;
    videoMeta.height = outHeight;
    videoMeta.pixFormat = pixFormat;
    videoMeta.fps = av_q2d(videoStream_->avg_frame_rate);

    // If sampledFrames is not empty, empty it
    if (sampledFrames.size() > 0) {
      sampledFrames.clear();
    }

    if (params.intervals_.size() == 0) {
      LOG(ERROR) << "Empty sampling intervals.";
      return;
    }

    std::vector<SampleInterval>::const_iterator itvlIter =
        params.intervals_.begin();
    if (itvlIter->timestamp != 0) {
      LOG(ERROR) << "Sampling interval starting timestamp is not zero.";
    }

    double currFps = itvlIter->fps;
    if (currFps < 0 && currFps != SpecialFps::SAMPLE_ALL_FRAMES &&
        currFps != SpecialFps::SAMPLE_TIMESTAMP_ONLY) {
      // fps must be 0, -1, -2 or > 0
      LOG(ERROR) << "Invalid sampling fps.";
    }

    double prevTimestamp = itvlIter->timestamp;
    itvlIter++;
    if (itvlIter != params.intervals_.end() &&
        prevTimestamp >= itvlIter->timestamp) {
      LOG(ERROR) << "Sampling interval timestamps must be strictly ascending.";
    }

    double lastFrameTimestamp = -1.0;
    // Initialize frame and packet.
    // These will be reused across calls.
    videoStreamFrame_ = av_frame_alloc();

    // frame index in the video stream
    int frameIndex = -1;
    // frame index of the output frames
    int outputFrameIndex = -1;

    /* identify the starting point from where we must start decoding */
    std::mt19937 meta_randgen(time(nullptr));
    int start_ts = -1;
    bool mustDecodeAll = false;
    if (videoStream_->duration > 0 && videoStream_->nb_frames > 0) {
      /* we have a valid duration and nb_frames. We can safely
       * detect an intermediate timestamp to start decoding from. */

      // leave a margin of 10 frames to take into account the error
      // from av_seek_frame
      int margin =
          int(ceil((10 * videoStream_->duration) / (videoStream_->nb_frames)));
      // if we need to do temporal jittering
      if (params.decode_type_ == DecodeType::DO_TMP_JITTER) {
        /* estimate the average duration for the required # of frames */
        double maxFramesDuration =
            (videoStream_->duration * params.num_of_required_frame_) /
            (videoStream_->nb_frames);
        int ts1 = 0;
        int ts2 = videoStream_->duration - int(ceil(maxFramesDuration));
        ts2 = ts2 > 0 ? ts2 : 0;
        // pick a random timestamp between ts1 and ts2. ts2 is selected such
        // that you have enough frames to satisfy the required # of frames.
        start_ts = std::uniform_int_distribution<>(ts1, ts2)(meta_randgen);
        // seek to a frame at start_ts
        ret = av_seek_frame(
            inputContext,
            videoStreamIndex_,
            std::max(0, start_ts - margin),
            AVSEEK_FLAG_BACKWARD);

        // if we need to decode from the start_frm
      } else if (params.decode_type_ == DecodeType::USE_START_FRM) {
        start_ts = int(floor(
            (videoStream_->duration * start_frm) / (videoStream_->nb_frames)));
        // seek to a frame at start_ts
        ret = av_seek_frame(
            inputContext,
            videoStreamIndex_,
            std::max(0, start_ts - margin),
            AVSEEK_FLAG_BACKWARD);
      } else {
        mustDecodeAll = true;
      }

      if (ret < 0) {
        LOG(ERROR) << "Unable to decode from a random start point";
        /* fall back to the default decoding of all frames from the start */
        av_seek_frame(inputContext, videoStreamIndex_, 0, AVSEEK_FLAG_BACKWARD);
        mustDecodeAll = true;
      }
    } else {
      /* we do not have the necessary metadata to selectively decode frames.
       * Decode all frames as we do in the default case */
      LOG(INFO) << " Decoding all frames as we do not have sufficient"
                   " metadata for selective decoding.";
      mustDecodeAll = true;
    }

    int gotPicture = 0;
    int eof = 0;
    int selectiveDecodedFrames = 0;

    int maxFrames = (params.decode_type_ == DecodeType::DO_UNIFORM_SMP)
        ? MAX_DECODING_FRAMES
        : params.num_of_required_frame_;
    // There is a delay between reading packets from the
    // transport and getting decoded frames back.
    // Therefore, after EOF, continue going while
    // the decoder is still giving us frames.
    while ((!eof || gotPicture) &&
           /* either decode all frames or decode up to maxFrames,
            * depending on the mustDecodeAll flag */
           (mustDecodeAll ||
            ((!mustDecodeAll) && (selectiveDecodedFrames < maxFrames)))) {
      try {
        if (!eof) {
          ret = av_read_frame(inputContext, &packet);

          if (ret == AVERROR(EAGAIN)) {
            av_free_packet(&packet);
            continue;
          }
          // Interpret any other error as EOF
          if (ret < 0) {
            eof = 1;
            av_free_packet(&packet);
            continue;
          }

          // Ignore packets from other streams
          if (packet.stream_index != videoStreamIndex_) {
            av_free_packet(&packet);
            continue;
          }
        }

        ret = avcodec_decode_video2(
            videoCodecContext_, videoStreamFrame_, &gotPicture, &packet);
        if (ret < 0) {
          LOG(ERROR) << "Error decoding video frame : " << ffmpegErrorStr(ret);
        }

        try {
          // Nothing to do without a picture
          if (!gotPicture) {
            av_free_packet(&packet);
            continue;
          }
          frameIndex++;

          double frame_ts =
              av_frame_get_best_effort_timestamp(videoStreamFrame_);
          double timestamp = frame_ts * av_q2d(videoStream_->time_base);

          if ((frame_ts >= start_ts && !mustDecodeAll) || mustDecodeAll) {
            /* process the current frame if:
             * 1) we are not doing selective decoding and mustDecodeAll is set
             * OR
             * 2) we are doing selective decoding and the current frame
             *    timestamp is >= start_ts, from where selective decoding
             *    starts */
            // if reaching the next interval, update the current fps
            // and reset lastFrameTimestamp so the current frame can be
            // sampled (unless fps == SpecialFps::SAMPLE_NO_FRAME)
            if (itvlIter != params.intervals_.end() &&
                timestamp >= itvlIter->timestamp) {
              lastFrameTimestamp = -1.0;
              currFps = itvlIter->fps;
              prevTimestamp = itvlIter->timestamp;
              itvlIter++;
              if (itvlIter != params.intervals_.end() &&
                  prevTimestamp >= itvlIter->timestamp) {
                LOG(ERROR)
                    << "Sampling interval timestamps must be strictly ascending.";
              }
            }

            // keyFrame will bypass all checks on fps sampling settings
            bool keyFrame = params.keyFrames_ && videoStreamFrame_->key_frame;
            if (!keyFrame) {
              // if fps == SpecialFps::SAMPLE_NO_FRAME (0), don't sample at all
              if (currFps == SpecialFps::SAMPLE_NO_FRAME) {
                av_free_packet(&packet);
                continue;
              }

              // fps is considered reached in the following cases:
              // 1. lastFrameTimestamp < 0 - start of a new interval
              //    (or first frame)
              // 2. currFps == SpecialFps::SAMPLE_ALL_FRAMES (-1) - sample
              //    every frame
              // 3. timestamp - lastFrameTimestamp has reached the target
              //    spacing and currFps > 0 (not a special fps setting)
              // different modes for fps:
              // SpecialFps::SAMPLE_NO_FRAME (0):
              //   disable fps sampling, no frame sampled at all
              // SpecialFps::SAMPLE_ALL_FRAMES (-1):
              //   unlimited fps sampling, will sample at the native video fps
              // SpecialFps::SAMPLE_TIMESTAMP_ONLY (-2):
              //   disable fps sampling, but will get the frame at a specific
              //   timestamp
              // others (> 0): decode at the specified fps
              bool fpsReached = lastFrameTimestamp < 0 ||
                  currFps == SpecialFps::SAMPLE_ALL_FRAMES ||
                  (currFps > 0 &&
                   timestamp >= lastFrameTimestamp + (1 / currFps));

              if (!fpsReached) {
                av_free_packet(&packet);
                continue;
              }
            }

            lastFrameTimestamp = timestamp;

            outputFrameIndex++;
            if (params.maximumOutputFrames_ != -1 &&
                outputFrameIndex >= params.maximumOutputFrames_) {
              // enough frames
              av_free_packet(&packet);
              break;
            }

            AVFrame* rgbFrame = av_frame_alloc();
            if (!rgbFrame) {
              LOG(ERROR) << "Error allocating AVframe";
            }

            try {
              // Determine required buffer size and allocate buffer
              int numBytes = avpicture_get_size(pixFormat, outWidth, outHeight);
              DecodedFrame::AvDataPtr buffer(
                  (uint8_t*)av_malloc(numBytes * sizeof(uint8_t)));

              int size = avpicture_fill(
                  (AVPicture*)rgbFrame,
                  buffer.get(),
                  pixFormat,
                  outWidth,
                  outHeight);

              sws_scale(
                  scaleContext_,
                  videoStreamFrame_->data,
                  videoStreamFrame_->linesize,
                  0,
                  videoCodecContext_->height,
                  rgbFrame->data,
                  rgbFrame->linesize);

              unique_ptr<DecodedFrame> frame = make_unique<DecodedFrame>();
              frame->width_ = outWidth;
              frame->height_ = outHeight;
              frame->data_ = move(buffer);
              frame->size_ = size;
              frame->index_ = frameIndex;
              frame->outputFrameIndex_ = outputFrameIndex;
              frame->timestamp_ = timestamp;
              frame->keyFrame_ = videoStreamFrame_->key_frame;

              sampledFrames.push_back(move(frame));
              selectiveDecodedFrames++;
              av_frame_free(&rgbFrame);
            } catch (const std::exception&) {
              av_frame_free(&rgbFrame);
            }
          }
          av_frame_unref(videoStreamFrame_);
        } catch (const std::exception&) {
          av_frame_unref(videoStreamFrame_);
        }

        av_free_packet(&packet);
      } catch (const std::exception&) {
        av_free_packet(&packet);
      }
    } // end of while loop

    // free all allocated resources
    sws_freeContext(scaleContext_);
    av_packet_unref(&packet);
    av_frame_free(&videoStreamFrame_);
    avcodec_close(videoCodecContext_);
    avformat_close_input(&inputContext);
    avformat_free_context(inputContext);
  } catch (const std::exception&) {
    // In case of a decoding error,
    // free all allocated resources
    sws_freeContext(scaleContext_);
    av_packet_unref(&packet);
    av_frame_free(&videoStreamFrame_);
    avcodec_close(videoCodecContext_);
    avformat_close_input(&inputContext);
    avformat_free_context(inputContext);
  }
}

void VideoDecoder::decodeMemory(
    const char* buffer,
    const int size,
    const Params& params,
    const int start_frm,
    std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
  VideoIOContext ioctx(buffer, size);
  decodeLoop(string("Memory Buffer"), ioctx, params, start_frm, sampledFrames);
}

void VideoDecoder::decodeFile(
    const string& file,
    const Params& params,
    const int start_frm,
    std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
  VideoIOContext ioctx(file);
  decodeLoop(file, ioctx, params, start_frm, sampledFrames);
}

string VideoDecoder::ffmpegErrorStr(int result) {
  std::array<char, 128> buf;
  av_strerror(result, buf.data(), buf.size());
  return string(buf.data());
}

} // namespace caffe2
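
For context, the sketch below shows how this decoder might be driven from caller code. It is not part of video_decoder.cc; it assumes that Params and DecodedFrame are declared at caffe2 namespace scope in video_decoder.h (as the signatures above suggest), that the default Params values are usable as-is, and that "clip.mp4" is a placeholder input path.

// Caller-side usage sketch (hypothetical example, not part of this file).
#include <caffe2/core/logging.h>
#include <caffe2/video/video_decoder.h>

#include <memory>
#include <vector>

void DecodeExample() {
  caffe2::VideoDecoder decoder;
  caffe2::Params params;
  params.maximumOutputFrames_ = 16; // stop after 16 sampled frames

  std::vector<std::unique_ptr<caffe2::DecodedFrame>> sampledFrames;
  // start_frm = 0; it is only consulted when decode_type_ == USE_START_FRM
  decoder.decodeFile("clip.mp4", params, 0, sampledFrames);

  for (const auto& frame : sampledFrames) {
    LOG(INFO) << "frame " << frame->index_ << " @ " << frame->timestamp_
              << "s (" << frame->width_ << "x" << frame->height_ << ")";
  }
}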