Caffe2 - C++ API
A deep learning, cross platform ML framework
video_decoder.h
#ifndef CAFFE2_VIDEO_VIDEO_DECODER_H_
#define CAFFE2_VIDEO_VIDEO_DECODER_H_

#include <caffe2/core/logging.h>
#include <stdio.h>
#include <string.h> // for memcpy used by the memory-mode read callback below
#include <memory>
#include <string>
#include <vector>

extern "C" {
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
}

namespace caffe2 {

#define VIO_BUFFER_SZ 32768
#define MAX_DECODING_FRAMES 10000

// enum to specify 3 special fps sampling behaviors:
// 0: disable fps sampling, no frame sampled at all
// -1: unlimited fps sampling, will sample at native video fps
// -2: disable fps sampling, but will get the frame at specific timestamp
enum SpecialFps {
  SAMPLE_NO_FRAME = 0,
  SAMPLE_ALL_FRAMES = -1,
  SAMPLE_TIMESTAMP_ONLY = -2,
};

// three different types of resolution when decoding the video
// 0: resize to width x height and ignore the aspect ratio;
// 1: resize to make size at least (width x height) and keep the aspect ratio;
// 2: using the original resolution of the video; if resolution
//    is smaller than crop_height x crop_width, resize to ensure
//    new height >= crop_height and new width >= crop_width
//    and keep the aspect ratio;
enum VideoResType {
  USE_WIDTH_HEIGHT = 0,
  USE_MINIMAL_WIDTH_HEIGHT = 1,
  ORIGINAL_RES = 2,
};

// three different types of decoding behavior are supported
// 0: do temporal jittering to sample a random clip from the video
// 1: uniformly sample multiple clips from the video
// 2: sample a clip starting from a given frame
enum DecodeType {
  DO_TMP_JITTER = 0,
  DO_UNIFORM_SMP = 1,
  USE_START_FRM = 2,
};

// sampling interval for fps starting at specified timestamp
// use enum SpecialFps to set special fps decoding behavior
// note sampled fps will not always accurately follow the target fps,
// because a sampled frame has to snap to an actual frame timestamp,
// e.g. video fps = 25, sample fps = 4 will sample every 0.28s, not 0.25s
// (the 0.25s target spans 6.25 frame intervals of 0.04s, which snaps up to
// 7 frames = 0.28s); video fps = 25, sample fps = 5 will sample every
// 0.24s, not 0.2s, because of floating-point division accuracy
// (1 / 5.0 is not exactly 0.2)
struct SampleInterval {
  double timestamp;
  double fps;
  SampleInterval() : timestamp(-1), fps(SpecialFps::SAMPLE_ALL_FRAMES) {}
  SampleInterval(double ts, double f) : timestamp(ts), fps(f) {}
  bool operator<(const SampleInterval& itvl) const {
    return (timestamp < itvl.timestamp);
  }
};

class Params {
 public:
  // return all key-frames regardless of specified fps
  bool keyFrames_ = false;

  // Output image pixel format
  AVPixelFormat pixelFormat_ = AVPixelFormat::AV_PIX_FMT_RGB24;

  // Index of stream to decode.
  // -1 will automatically decode the first video stream.
  int streamIndex_ = -1;

  // How many frames to output at most from the video
  // -1 no limit
  int maximumOutputFrames_ = -1;

  // params for video resolution
  int video_res_type_ = VideoResType::USE_WIDTH_HEIGHT;

  // the size of the patch cropped from the input video
  int crop_height_ = -1;
  int crop_width_ = -1;

  // minimal resolution for resizing when using USE_MINIMAL_WIDTH_HEIGHT
  int height_min_ = -1;
  int width_min_ = -1;

  // the video resolution after resizing
  int scale_w_ = -1;
  int scale_h_ = -1;

  // params for decoding behavior
  int decode_type_ = DecodeType::DO_TMP_JITTER;
  int num_of_required_frame_ = -1;

  // intervals_ control variable sampling fps between different timestamps
  // intervals_ must be ordered strictly ascending by timestamps
  // the first interval must have a timestamp of zero
  // fps must be either the 3 special fps defined in SpecialFps, or > 0
  std::vector<SampleInterval> intervals_ = {{0, SpecialFps::SAMPLE_ALL_FRAMES}};

  Params() {}

  /**
   * FPS of output frames
   * setting here will reset intervals_ and force decoding at the target FPS
   */
  Params& fps(float v) {
    intervals_.clear();
    intervals_.emplace_back(0, v);
    return *this;
  }

  /**
   * Sample output frames at a specified list of timestamps
   * Timestamps must be in increasing order
   */
  Params& setSampleTimestamps(const std::vector<double>& timestamps) {
    intervals_.clear();
    // insert an interval per desired frame.
    for (auto& timestamp : timestamps) {
      intervals_.emplace_back(timestamp, SpecialFps::SAMPLE_TIMESTAMP_ONLY);
    }
    return *this;
  }

  /**
   * Pixel format of output buffer, default PIX_FMT_RGB24
   */
  Params& pixelFormat(AVPixelFormat pixelFormat) {
    pixelFormat_ = pixelFormat;
    return *this;
  }

  /**
   * Return all key-frames
   */
  Params& keyFrames(bool keyFrames) {
    keyFrames_ = keyFrames;
    return *this;
  }

  /**
   * Index of video stream to process, defaults to the first video stream
   */
  Params& streamIndex(int index) {
    streamIndex_ = index;
    return *this;
  }

  /**
   * Only output this many frames, default to no limit
   */
  Params& maxOutputFrames(int count) {
    maximumOutputFrames_ = count;
    return *this;
  }

  /**
   * Output frame width, default to video width
   */
  Params& outputWidth(int width) {
    scale_w_ = width;
    return *this;
  }

  /**
   * Output frame height, default to video height
   */
  Params& outputHeight(int height) {
    scale_h_ = height;
    return *this;
  }
};
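
// A minimal usage sketch (not part of the original header): each setter
// returns *this, so a Params object can be configured by chaining. The
// values below are purely illustrative.
//
//   Params params;
//   params.fps(5)
//       .pixelFormat(AV_PIX_FMT_RGB24)
//       .keyFrames(false)
//       .maxOutputFrames(100)
//       .outputWidth(171)
//       .outputHeight(128);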

// data structure for storing decoded video frames
class DecodedFrame {
 public:
  struct avDeleter {
    void operator()(unsigned char* p) const {
      av_free(p);
    }
  };
  using AvDataPtr = std::unique_ptr<uint8_t, avDeleter>;

  // decoded data buffer
  AvDataPtr data_;

  // size in bytes
  int size_ = 0;

  // frame dimensions
  int width_ = 0;
  int height_ = 0;

  // timestamp in seconds since beginning of video
  double timestamp_ = 0;

  // true if this is a key frame.
  bool keyFrame_ = false;

  // index of frame in video
  int index_ = -1;

  // Sequential number of outputted frame
  int outputFrameIndex_ = -1;
};
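
// A minimal sketch (not part of the original header) of walking the pixels of
// a DecodedFrame, assuming the default AV_PIX_FMT_RGB24 output and a tightly
// packed buffer (i.e. size_ == width_ * height_ * 3):
//
//   const uint8_t* data = frame.data_.get();
//   for (int y = 0; y < frame.height_; ++y) {
//     for (int x = 0; x < frame.width_; ++x) {
//       const uint8_t* rgb = data + (y * frame.width_ + x) * 3;
//       // rgb[0], rgb[1], rgb[2] hold the R, G, B values of pixel (x, y)
//     }
//   }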

class VideoIOContext {
 public:
  explicit VideoIOContext(const std::string& fname)
      : workBuffersize_(VIO_BUFFER_SZ),
        workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
        inputFile_(nullptr),
        inputBuffer_(nullptr),
        inputBufferSize_(0) {
    inputFile_ = fopen(fname.c_str(), "rb");
    if (inputFile_ == nullptr) {
      LOG(ERROR) << "Error opening video file " << fname;
    }
    ctx_ = avio_alloc_context(
        static_cast<unsigned char*>(workBuffer_.get()),
        workBuffersize_,
        0,
        this,
        &VideoIOContext::readFile,
        nullptr, // no write function
        &VideoIOContext::seekFile);
  }

  explicit VideoIOContext(const char* buffer, int size)
      : workBuffersize_(VIO_BUFFER_SZ),
        workBuffer_((uint8_t*)av_malloc(workBuffersize_)),
        inputFile_(nullptr),
        inputBuffer_(buffer),
        inputBufferSize_(size) {
    ctx_ = avio_alloc_context(
        static_cast<unsigned char*>(workBuffer_.get()),
        workBuffersize_,
        0,
        this,
        &VideoIOContext::readMemory,
        nullptr, // no write function
        &VideoIOContext::seekMemory);
  }

  ~VideoIOContext() {
    av_free(ctx_);
    if (inputFile_) {
      fclose(inputFile_);
    }
  }

  int read(unsigned char* buf, int buf_size) {
    if (inputBuffer_) {
      return readMemory(this, buf, buf_size);
    } else if (inputFile_) {
      return readFile(this, buf, buf_size);
    } else {
      return -1;
    }
  }

  int64_t seek(int64_t offset, int whence) {
    if (inputBuffer_) {
      return seekMemory(this, offset, whence);
    } else if (inputFile_) {
      return seekFile(this, offset, whence);
    } else {
      return -1;
    }
  }

  static int readFile(void* opaque, unsigned char* buf, int buf_size) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    if (feof(h->inputFile_)) {
      return AVERROR_EOF;
    }
    size_t ret = fread(buf, 1, buf_size, h->inputFile_);
    if (ret < buf_size) {
      if (ferror(h->inputFile_)) {
        return -1;
      }
    }
    return ret;
  }

  static int64_t seekFile(void* opaque, int64_t offset, int whence) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    switch (whence) {
      case SEEK_CUR: // from current position
      case SEEK_END: // from eof
      case SEEK_SET: // from beginning of file
        return fseek(h->inputFile_, static_cast<long>(offset), whence);
        break;
      case AVSEEK_SIZE:
        int64_t cur = ftell(h->inputFile_);
        fseek(h->inputFile_, 0L, SEEK_END);
        int64_t size = ftell(h->inputFile_);
        fseek(h->inputFile_, cur, SEEK_SET);
        return size;
    }

    return -1;
  }

  static int readMemory(void* opaque, unsigned char* buf, int buf_size) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    if (buf_size < 0) {
      return -1;
    }

    int reminder = h->inputBufferSize_ - h->offset_;
    int r = buf_size < reminder ? buf_size : reminder;
    if (r < 0) {
      return AVERROR_EOF;
    }

    memcpy(buf, h->inputBuffer_ + h->offset_, r);
    h->offset_ += r;
    return r;
  }

  static int64_t seekMemory(void* opaque, int64_t offset, int whence) {
    VideoIOContext* h = static_cast<VideoIOContext*>(opaque);
    switch (whence) {
      case SEEK_CUR: // from current position
        h->offset_ += offset;
        break;
      case SEEK_END: // from eof
        h->offset_ = h->inputBufferSize_ + offset;
        break;
      case SEEK_SET: // from beginning of file
        h->offset_ = offset;
        break;
      case AVSEEK_SIZE:
        return h->inputBufferSize_;
    }
    return h->offset_;
  }

  AVIOContext* get_avio() {
    return ctx_;
  }

 private:
  int workBuffersize_;
  DecodedFrame::AvDataPtr workBuffer_;
  // for file mode
  FILE* inputFile_;

  // for memory mode
  const char* inputBuffer_;
  int inputBufferSize_;
  int offset_ = 0;

  AVIOContext* ctx_;
};
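
// A hedged sketch (not part of the original header) of how a custom
// AVIOContext such as the one above is typically attached to an FFmpeg
// AVFormatContext; the actual wiring done inside decodeLoop() is not shown
// in this header.
//
//   VideoIOContext ioctx(buffer, size);
//   AVFormatContext* fmt = avformat_alloc_context();
//   fmt->pb = ioctx.get_avio();
//   fmt->flags |= AVFMT_FLAG_CUSTOM_IO;
//   if (avformat_open_input(&fmt, "", nullptr, nullptr) == 0) {
//     avformat_find_stream_info(fmt, nullptr);
//     // ... read packets, decode frames ...
//     avformat_close_input(&fmt);
//   }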

struct VideoMeta {
  double fps;
  int width;
  int height;
  enum AVMediaType codec_type;
  AVPixelFormat pixFormat;
  VideoMeta()
      : fps(-1),
        width(-1),
        height(-1),
        codec_type(AVMEDIA_TYPE_VIDEO),
        pixFormat(AVPixelFormat::AV_PIX_FMT_RGB24) {}
};

class VideoDecoder {
 public:
  VideoDecoder();

  void decodeFile(
      const std::string& filename,
      const Params& params,
      const int start_frm,
      std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);

  void decodeMemory(
      const char* buffer,
      const int size,
      const Params& params,
      const int start_frm,
      std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);

 private:
  std::string ffmpegErrorStr(int result);

  void ResizeAndKeepAspectRatio(
      const int origHeight,
      const int origWidth,
      const int heightMin,
      const int widthMin,
      int& outHeight,
      int& outWidth);

  void decodeLoop(
      const std::string& videoName,
      VideoIOContext& ioctx,
      const Params& params,
      const int start_frm,
      std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames);
};
} // namespace caffe2

#endif // CAFFE2_VIDEO_VIDEO_DECODER_H_
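
For reference, a minimal end-to-end sketch of how the pieces above fit together: a Params object selects the sampling behavior and output format, and VideoDecoder::decodeFile fills a vector of DecodedFrame objects. The calling code below is illustrative only (the function dumpTimestamps is hypothetical, not part of Caffe2).

#include <caffe2/video/video_decoder.h>

// Illustrative only: decode up to 100 frames at roughly 5 fps from a video
// file, starting at frame 0, and log each frame's timestamp and size.
void dumpTimestamps(const std::string& path) {
  caffe2::VideoDecoder decoder;
  caffe2::Params params;
  params.fps(5).maxOutputFrames(100);

  std::vector<std::unique_ptr<caffe2::DecodedFrame>> frames;
  decoder.decodeFile(path, params, /*start_frm=*/0, frames);

  for (const auto& frame : frames) {
    LOG(INFO) << "frame " << frame->outputFrameIndex_ << " @ "
              << frame->timestamp_ << "s, " << frame->width_ << "x"
              << frame->height_;
  }
}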