Caffe2 - C++ API
A deep learning, cross platform ML framework
video_decoder.h
1 
17 #ifndef CAFFE2_VIDEO_VIDEO_DECODER_H_
18 #define CAFFE2_VIDEO_VIDEO_DECODER_H_
19 
20 #include <caffe2/core/logging.h>
21 #include <stdio.h>
22 #include <memory>
23 #include <string>
24 #include <vector>
25 
26 extern "C" {
27 #include <libavformat/avformat.h>
28 #include <libavformat/avio.h>
29 }
30 
31 namespace caffe2 {
32 
33 #define VIO_BUFFER_SZ 32768
34 #define MAX_DECODING_FRAMES 10000
35 
// Sentinel fps values that select a special sampling behavior instead of a
// concrete sampling rate:
//   SAMPLE_NO_FRAME (0):        fps sampling disabled, no frame is sampled
//   SAMPLE_ALL_FRAMES (-1):     unlimited fps sampling, i.e. sample at the
//                               native video fps
//   SAMPLE_TIMESTAMP_ONLY (-2): fps sampling disabled, but the frame at the
//                               specific timestamp is still fetched
enum SpecialFps {
  SAMPLE_TIMESTAMP_ONLY = -2,
  SAMPLE_ALL_FRAMES = -1,
  SAMPLE_NO_FRAME = 0,
};
45 
// How the output resolution is chosen when decoding a video.
enum VideoResType {
  // resize to width x height and ignore the aspect ratio
  USE_WIDTH_HEIGHT = 0,

  // resize so the size is at least (width x height), keeping the aspect ratio
  USE_MINIMAL_WIDTH_HEIGHT = 1,

  // keep the original resolution of the video; if that resolution is smaller
  // than crop_height x crop_width, resize (keeping the aspect ratio) so that
  // new height >= crop_height and new width >= crop_width
  ORIGINAL_RES = 2,
};
58 
// three different types of decoding behavior are supported
// (the numbers below are the enumerator values actually defined here)
// 0: DO_TMP_JITTER  - do temporal jittering to sample a random clip from the
//                     video
// 1: DO_UNIFORM_SMP - uniformly sample multiple clips from the video
// 2: USE_START_FRM  - sample a clip from a given starting frame
enum DecodeType {
  DO_TMP_JITTER = 0,
  DO_UNIFORM_SMP = 1,
  USE_START_FRM = 2,
};
68 
69 // sampling interval for fps starting at specified timestamp
70 // use enum SpecialFps to set special fps decoding behavior
71 // note sampled fps will not always accurately follow the target fps,
72 // because sampled frame has to snap to actual frame timestamp,
73 // e.g. video fps = 25, sample fps = 4 will sample every 0.28s, not 0.25
74 // video fps = 25, sample fps = 5 will sample every 0.24s, not 0.2,
75 // because of floating-point division accuracy (1 / 5.0 is not exactly 0.2)
77  double timestamp;
78  double fps;
79  SampleInterval() : timestamp(-1), fps(SpecialFps::SAMPLE_ALL_FRAMES) {}
80  SampleInterval(double ts, double f) : timestamp(ts), fps(f) {}
81  bool operator<(const SampleInterval& itvl) const {
82  return (timestamp < itvl.timestamp);
83  }
84 };
85 
86 class Params {
87  public:
88  // return all key-frames regardless of specified fps
89  bool keyFrames_ = false;
90 
91  // Output image pixel format
92  AVPixelFormat pixelFormat_ = AVPixelFormat::AV_PIX_FMT_RGB24;
93 
94  // Index of stream to decode.
95  // -1 will automatically decode the first video stream.
96  int streamIndex_ = -1;
97 
98  // How many frames to output at most from the video
99  // -1 no limit
100  int maximumOutputFrames_ = -1;
101 
102  // params for video resolution
103  int video_res_type_ = VideoResType::USE_WIDTH_HEIGHT;
104  int crop_height_ = -1;
105  int crop_width_ = -1;
106  int height_min_ = -1;
107  int width_min_ = -1;
108  int scale_w_ = -1;
109  int scale_h_ = -1;
110 
111  // params for decoding behavior
112  int decode_type_ = DecodeType::DO_TMP_JITTER;
113  int num_of_required_frame_ = -1;
114 
115  // intervals_ control variable sampling fps between different timestamps
116  // intervals_ must be ordered strictly ascending by timestamps
117  // the first interval must have a timestamp of zero
118  // fps must be either the 3 special fps defined in SpecialFps, or > 0
119  std::vector<SampleInterval> intervals_ = {{0, SpecialFps::SAMPLE_ALL_FRAMES}};
120 
121  Params() {}
122 
128  Params& fps(float v) {
129  intervals_.clear();
130  intervals_.emplace_back(0, v);
131  return *this;
132  }
133 
137  Params& pixelFormat(AVPixelFormat pixelFormat) {
138  pixelFormat_ = pixelFormat;
139  return *this;
140  }
141 
145  Params& keyFrames(bool keyFrames) {
146  keyFrames_ = keyFrames;
147  return *this;
148  }
149 
153  Params& streamIndex(int index) {
154  streamIndex_ = index;
155  return *this;
156  }
157 
161  Params& maxOutputFrames(int count) {
162  maximumOutputFrames_ = count;
163  return *this;
164  }
165 
169  Params& outputWidth(int width) {
170  scale_w_ = width;
171  return *this;
172  }
173 
177  Params& outputHeight(int height) {
178  scale_h_ = height;
179  return *this;
180  }
181 };
182 
183 // data structure for storing decoded video frames
185  public:
186  struct avDeleter {
187  void operator()(unsigned char* p) const {
188  av_free(p);
189  }
190  };
191  using AvDataPtr = std::unique_ptr<uint8_t, avDeleter>;
192 
193  // decoded data buffer
194  AvDataPtr data_;
195 
196  // size in bytes
197  int size_ = 0;
198 
199  // frame dimensions
200  int width_ = 0;
201  int height_ = 0;
202 
203  // timestamp in seconds since beginning of video
204  double timestamp_ = 0;
205 
206  // true if this is a key frame.
207  bool keyFrame_ = false;
208 
209  // index of frame in video
210  int index_ = -1;
211 
212  // Sequential number of outputted frame
213  int outputFrameIndex_ = -1;
214 };
215 
217  public:
218  explicit VideoIOContext(const std::string& fname)
219  : workBuffersize_(VIO_BUFFER_SZ),
220  workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
221  inputFile_(nullptr),
222  inputBuffer_(nullptr),
223  inputBufferSize_(0) {
224  inputFile_ = fopen(fname.c_str(), "rb");
225  if (inputFile_ == nullptr) {
226  LOG(ERROR) << "Error opening video file " << fname;
227  }
228  ctx_ = avio_alloc_context(
229  static_cast<unsigned char*>(workBuffer_.get()),
230  workBuffersize_,
231  0,
232  this,
233  &VideoIOContext::readFile,
234  nullptr, // no write function
235  &VideoIOContext::seekFile);
236  }
237 
238  explicit VideoIOContext(const char* buffer, int size)
239  : workBuffersize_(VIO_BUFFER_SZ),
240  workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
241  inputFile_(nullptr),
242  inputBuffer_(buffer),
243  inputBufferSize_(size) {
244  ctx_ = avio_alloc_context(
245  static_cast<unsigned char*>(workBuffer_.get()),
246  workBuffersize_,
247  0,
248  this,
249  &VideoIOContext::readMemory,
250  nullptr, // no write function
251  &VideoIOContext::seekMemory);
252  }
253 
254  ~VideoIOContext() {
255  av_free(ctx_);
256  if (inputFile_) {
257  fclose(inputFile_);
258  }
259  }
260 
261  int read(unsigned char* buf, int buf_size) {
262  if (inputBuffer_) {
263  return readMemory(this, buf, buf_size);
264  } else if (inputFile_) {
265  return readFile(this, buf, buf_size);
266  } else {
267  return -1;
268  }
269  }
270 
271  int64_t seek(int64_t offset, int whence) {
272  if (inputBuffer_) {
273  return seekMemory(this, offset, whence);
274  } else if (inputFile_) {
275  return seekFile(this, offset, whence);
276  } else {
277  return -1;
278  }
279  }
280 
281  static int readFile(void* opaque, unsigned char* buf, int buf_size) {
282  VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
283  if (feof(h->inputFile_)) {
284  return AVERROR_EOF;
285  }
286  size_t ret = fread(buf, 1, buf_size, h->inputFile_);
287  if (ret < buf_size) {
288  if (ferror(h->inputFile_)) {
289  return -1;
290  }
291  }
292  return ret;
293  }
294 
295  static int64_t seekFile(void* opaque, int64_t offset, int whence) {
296  VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
297  switch (whence) {
298  case SEEK_CUR: // from current position
299  case SEEK_END: // from eof
300  case SEEK_SET: // from beginning of file
301  return fseek(h->inputFile_, static_cast<long>(offset), whence);
302  break;
303  case AVSEEK_SIZE:
304  int64_t cur = ftell(h->inputFile_);
305  fseek(h->inputFile_, 0L, SEEK_END);
306  int64_t size = ftell(h->inputFile_);
307  fseek(h->inputFile_, cur, SEEK_SET);
308  return size;
309  }
310 
311  return -1;
312  }
313 
314  static int readMemory(void* opaque, unsigned char* buf, int buf_size) {
315  VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
316  if (buf_size < 0) {
317  return -1;
318  }
319 
320  int reminder = h->inputBufferSize_ - h->offset_;
321  int r = buf_size < reminder ? buf_size : reminder;
322  if (r < 0) {
323  return AVERROR_EOF;
324  }
325 
326  memcpy(buf, h->inputBuffer_ + h->offset_, r);
327  h->offset_ += r;
328  return r;
329  }
330 
331  static int64_t seekMemory(void* opaque, int64_t offset, int whence) {
332  VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
333  switch (whence) {
334  case SEEK_CUR: // from current position
335  h->offset_ += offset;
336  break;
337  case SEEK_END: // from eof
338  h->offset_ = h->inputBufferSize_ + offset;
339  break;
340  case SEEK_SET: // from beginning of file
341  h->offset_ = offset;
342  break;
343  case AVSEEK_SIZE:
344  return h->inputBufferSize_;
345  }
346  return h->offset_;
347  }
348 
349  AVIOContext* get_avio() {
350  return ctx_;
351  }
352 
353  private:
354  int workBuffersize_;
355  DecodedFrame::AvDataPtr workBuffer_;
356  // for file mode
357  FILE* inputFile_;
358 
359  // for memory mode
360  const char* inputBuffer_;
361  int inputBufferSize_;
362  int offset_ = 0;
363 
364  AVIOContext* ctx_;
365 };
366 
367 struct VideoMeta {
368  double fps;
369  int width;
370  int height;
371  enum AVMediaType codec_type;
372  AVPixelFormat pixFormat;
373  VideoMeta()
374  : fps(-1),
375  width(-1),
376  height(-1),
377  codec_type(AVMEDIA_TYPE_VIDEO),
378  pixFormat(AVPixelFormat::AV_PIX_FMT_RGB24) {}
379 };
380 
382  public:
383  VideoDecoder();
384 
385  void decodeFile(
386  const std::string& filename,
387  const Params& params,
388  const int start_frm,
389  std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
390 
391  void decodeMemory(
392  const char* buffer,
393  const int size,
394  const Params& params,
395  const int start_frm,
396  std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
397 
398  private:
399  std::string ffmpegErrorStr(int result);
400 
401  void ResizeAndKeepAspectRatio(
402  const int origHeight,
403  const int origWidth,
404  const int heightMin,
405  const int widthMin,
406  int& outHeight,
407  int& outWidth);
408 
409  void decodeLoop(
410  const std::string& videoName,
411  VideoIOContext& ioctx,
412  const Params& params,
413  const int start_frm,
414  std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
415 };
416 } // namespace caffe2
417 
418 #endif // CAFFE2_VIDEO_VIDEO_DECODER_H_
Params & outputHeight(int height)
Output frame height, default to video height.
Params & keyFrames(bool keyFrames)
Return all key-frames.
Params & outputWidth(int width)
Output frame width, default to video width.
Params & streamIndex(int index)
Index of video stream to process, defaults to the first video stream.
Copyright (c) 2016-present, Facebook, Inc.
Params & fps(float v)
FPS of output frames. Setting it here will reset intervals_ and force decoding at the target FPS. This can be ...
Params & pixelFormat(AVPixelFormat pixelFormat)
Pixel format of output buffer, default PIX_FMT_RGB24.
Params & maxOutputFrames(int count)
Only output this many frames, default to no limit.