1 #include <caffe2/video/video_decoder.h> 2 #include <caffe2/core/logging.h> 9 #include <libavcodec/avcodec.h> 10 #include <libavformat/avformat.h> 11 #include <libavutil/log.h> 12 #include <libswresample/swresample.h> 13 #include <libswscale/swscale.h> 18 VideoDecoder::VideoDecoder() {
// Constructor: performs the process-wide FFmpeg setup under a function-local mutex.
// NOTE: this listing is partial (the embedded line numbers skip), so the test of
// gInitialized and the closing braces are not visible here.
// Function-local flag; presumably guards one-time initialization - TODO confirm
// against the lines that are missing from this excerpt.
19 static bool gInitialized =
false;
// The mutex is held for the rest of the constructor body shown below.
20 static std::mutex gMutex;
21 std::unique_lock<std::mutex> lock(gMutex);
// Register the available codecs and initialize the FFmpeg network layer.
24 avcodec_register_all();
25 avformat_network_init();
// Computes an output size (outWidth, outHeight) that follows the aspect ratio of the
// original frame, based on a minimum height/width pair (heightMin, widthMin).
// NOTE: the parameter list and some body lines are missing from this excerpt.
30 void VideoDecoder::ResizeAndKeepAspectRatio(
// Aspect ratios are expressed as height / width.
37 float min_aspect = (float)heightMin / (
float)widthMin;
38 float video_aspect = (float)origHeight / (
float)origWidth;
// Video is at least as tall (relative to its width) as the minimum box:
// the height is derived from outWidth. The assignment of outWidth in this branch
// is not visible here; presumably it is set to widthMin - TODO confirm.
39 if (video_aspect >= min_aspect) {
41 outHeight = (int)ceil(video_aspect * outWidth);
// Otherwise (the else line is not shown) the height is pinned to heightMin and
// the width is derived from it, rounding up.
43 outHeight = heightMin;
44 outWidth = (int)ceil(outHeight / video_aspect);
// Core decode routine: opens the stream exposed by ioctx with FFmpeg, walks its
// packets, and appends the frames selected by params to sampledFrames.
// videoName appears only in log messages in the visible code.
// NOTE: this listing is partial (the embedded line numbers skip), so several
// declarations, conditions, return statements and braces are not shown.
48 void VideoDecoder::decodeLoop(
49 const string& videoName,
50 VideoIOContext& ioctx,
53 std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
// Decoder state. The pointers start out null and are assigned further down.
54 AVPixelFormat pixFormat = params.pixelFormat_;
55 AVFormatContext* inputContext = avformat_alloc_context();
56 AVStream* videoStream_ =
nullptr;
57 AVCodecContext* videoCodecContext_ =
nullptr;
58 AVFrame* videoStreamFrame_ =
nullptr;
// The declaration of packet is not visible in this excerpt.
60 av_init_packet(&packet);
61 SwsContext* scaleContext_ =
nullptr;
// Route demuxer I/O through the AVIO context supplied by ioctx and flag it as custom I/O.
64 inputContext->pb = ioctx.get_avio();
65 inputContext->flags |= AVFMT_FLAG_CUSTOM_IO;
// Format probing: read 1 KiB from the stream into a zero-filled buffer that has
// AVPROBE_PADDING_SIZE extra bytes at the end.
69 int probeSz = 1 * 1024 + AVPROBE_PADDING_SIZE;
70 DecodedFrame::AvDataPtr probe((uint8_t*)av_malloc(probeSz));
72 memset(probe.get(), 0, probeSz);
73 int len = ioctx.read(probe.get(), probeSz - AVPROBE_PADDING_SIZE);
// A short read is reported as not enough data to identify the format.
74 if (len < probeSz - AVPROBE_PADDING_SIZE) {
75 LOG(ERROR) <<
"Insufficient data to determine video format";
// Rewind so that later reads start from the beginning of the stream again.
80 ioctx.seek(0, SEEK_SET);
// Describe the probed bytes to FFmpeg and let it choose the input format.
82 unique_ptr<AVProbeData> probeData(
new AVProbeData());
83 probeData->buf = probe.get();
84 probeData->buf_size = len;
85 probeData->filename =
"";
87 inputContext->iformat = av_probe_input_format(probeData.get(), 1);
// Open the input with an empty URL; the I/O context was attached above.
89 ret = avformat_open_input(&inputContext,
"",
nullptr,
nullptr);
// Failure log for the open call (the condition line is not shown).
91 LOG(ERROR) <<
"Unable to open stream " << ffmpegErrorStr(ret);
// Gather per-stream information; the failure log follows (condition not shown).
95 ret = avformat_find_stream_info(inputContext,
nullptr);
97 LOG(ERROR) <<
"Unable to find stream info in " << videoName <<
" " 98 << ffmpegErrorStr(ret);
// Stream selection: when streamIndex_ is -1, scan the streams for one whose codec
// type is video.
// NOTE(review): in the visible lines videoStream_ is only assigned inside the -1
// branch; confirm that an explicit streamIndex_ is handled in the missing lines.
// NOTE(review): i is int while nb_streams is presumably unsigned - confirm.
103 int videoStreamIndex_ = params.streamIndex_;
104 if (videoStreamIndex_ == -1) {
105 for (
int i = 0; i < inputContext->nb_streams; i++) {
106 auto stream = inputContext->streams[i];
107 if (stream->codec->codec_type == AVMEDIA_TYPE_VIDEO) {
108 videoStreamIndex_ = i;
109 videoStream_ = stream;
// No video stream located: log it.
// NOTE(review): ret still appears to hold the result of the earlier
// avformat_find_stream_info call, which looks unrelated to this failure - confirm.
115 if (videoStream_ ==
nullptr) {
116 LOG(ERROR) <<
"Unable to find video stream in " << videoName <<
" " 117 << ffmpegErrorStr(ret);
// Open the decoder that matches the codec id of the chosen stream. The call that
// takes these arguments is only partly visible in this excerpt.
122 AVDictionary* opts =
nullptr;
123 videoCodecContext_ = videoStream_->codec;
127 avcodec_find_decoder(videoCodecContext_->codec_id),
// Exceptions raised while opening the codec are logged.
129 }
catch (
const std::exception&) {
130 LOG(ERROR) <<
"Exception during open video codec";
// Failure log for the codec open (the condition line is not shown).
135 LOG(ERROR) <<
"Cannot open video codec : " 136 << videoCodecContext_->codec->name;
// Output size defaults to the decoded size and is then adjusted according to
// params.video_res_type_.
141 int origWidth = videoCodecContext_->width;
142 int origHeight = videoCodecContext_->height;
143 int outWidth = origWidth;
144 int outHeight = origHeight;
// ORIGINAL_RES: resize through ResizeAndKeepAspectRatio only when the requested
// crop is larger than the frame in either dimension (call arguments not shown).
146 if (params.video_res_type_ == VideoResType::ORIGINAL_RES) {
149 if (params.crop_width_ > origWidth || params.crop_height_ > origHeight) {
150 ResizeAndKeepAspectRatio(
// USE_MINIMAL_WIDTH_HEIGHT: also goes through ResizeAndKeepAspectRatio.
159 params.video_res_type_ == VideoResType::USE_MINIMAL_WIDTH_HEIGHT) {
162 ResizeAndKeepAspectRatio(
// USE_WIDTH_HEIGHT: take the size directly from scale_w_ / scale_h_.
169 }
else if (params.video_res_type_ == VideoResType::USE_WIDTH_HEIGHT) {
172 outWidth = params.scale_w_;
173 outHeight = params.scale_h_;
// Any other value is logged as unknown.
175 LOG(ERROR) <<
"Unknown video_res_type: " << params.video_res_type_;
// The source pixel format must be known before the scaler is created.
179 CAFFE_ENFORCE_NE(videoCodecContext_->pix_fmt, AV_PIX_FMT_NONE);
// Create the swscale context; the source side uses the decoded width, height and
// pixel format (destination arguments are not shown in this excerpt).
182 scaleContext_ = sws_getContext(
183 videoCodecContext_->width,
184 videoCodecContext_->height,
185 videoCodecContext_->pix_fmt,
// Record codec type, output size, pixel format and average frame rate in videoMeta.
196 videoMeta.codec_type = videoCodecContext_->codec_type;
197 videoMeta.width = outWidth;
198 videoMeta.height = outHeight;
199 videoMeta.pixFormat = pixFormat;
200 videoMeta.fps = av_q2d(videoStream_->avg_frame_rate);
// Start from an empty result vector.
203 if (sampledFrames.size() > 0) {
204 sampledFrames.clear();
// Validate the sampling schedule: it must be non-empty, start at timestamp 0,
// carry a valid fps, and have strictly ascending timestamps.
207 if (params.intervals_.size() == 0) {
208 LOG(ERROR) <<
"Empty sampling intervals.";
212 std::vector<SampleInterval>::const_iterator itvlIter =
213 params.intervals_.begin();
214 if (itvlIter->timestamp != 0) {
215 LOG(ERROR) <<
"Sampling interval starting timestamp is not zero.";
// A negative fps is only accepted when it is one of the two special values below.
218 double currFps = itvlIter->fps;
219 if (currFps < 0 && currFps != SpecialFps::SAMPLE_ALL_FRAMES &&
220 currFps != SpecialFps::SAMPLE_TIMESTAMP_ONLY) {
222 LOG(ERROR) <<
"Invalid sampling fps.";
// The step that advances itvlIter between these lines is not visible here.
225 double prevTimestamp = itvlIter->timestamp;
227 if (itvlIter != params.intervals_.end() &&
228 prevTimestamp >= itvlIter->timestamp) {
229 LOG(ERROR) <<
"Sampling interval timestamps must be strictly ascending.";
// A negative lastFrameTimestamp makes the fps check further down accept the
// next frame unconditionally.
232 double lastFrameTimestamp = -1.0;
// Frame that receives the decoder output.
235 videoStreamFrame_ = av_frame_alloc();
240 int outputFrameIndex = -1;
// Random generator seeded from the current time; used below to draw a start timestamp.
243 std::mt19937 meta_randgen(time(
nullptr));
244 long int start_ts = -1;
245 bool mustDecodeAll =
false;
// Picking a start point requires both a positive stream duration and frame count.
246 if (videoStream_->duration > 0 && videoStream_->nb_frames > 0) {
// The left-hand side of this expression is not shown; presumably it is the seek
// margin used below, about ten frames worth of stream time.
// NOTE(review): if duration and nb_frames are integers, the division truncates
// before ceil is applied, so ceil has no effect - confirm.
253 int(ceil((10 * videoStream_->duration) / (videoStream_->nb_frames)));
// DO_TMP_JITTER: draw a random start timestamp in [ts1, ts2]. ts2 leaves room for
// num_of_required_frame_ frames before the end of the stream and is clamped at 0.
// The definition of ts1 is not visible here.
255 if (params.decode_type_ == DecodeType::DO_TMP_JITTER) {
257 double maxFramesDuration =
258 (videoStream_->duration * params.num_of_required_frame_) /
259 (videoStream_->nb_frames);
261 int ts2 = videoStream_->duration - int(ceil(maxFramesDuration));
262 ts2 = ts2 > 0 ? ts2 : 0;
265 start_ts = std::uniform_int_distribution<>(ts1, ts2)(meta_randgen);
// Arguments of a seek call whose first lines are not shown: the target is
// start_ts minus margin, clamped at 0, seeking backward.
270 0 > (start_ts - margin) ? 0 : (start_ts - margin),
271 AVSEEK_FLAG_BACKWARD);
// USE_START_FRM: map the requested start frame to a stream timestamp in
// proportion to duration / nb_frames, then seek as above.
274 }
else if (params.decode_type_ == DecodeType::USE_START_FRM) {
275 start_ts = int(floor(
276 (videoStream_->duration * start_frm) / (videoStream_->nb_frames)));
281 0 > (start_ts - margin) ? 0 : (start_ts - margin),
282 AVSEEK_FLAG_BACKWARD);
// The branch that sets this flag is not visible in this excerpt.
284 mustDecodeAll =
true;
// Fallback: log, rewind to timestamp 0 and decode everything (condition not shown).
288 LOG(ERROR) <<
"Unable to decode from a random start point";
290 av_seek_frame(inputContext, videoStreamIndex_, 0, AVSEEK_FLAG_BACKWARD);
291 mustDecodeAll =
true;
// Per the log text, this path is taken when the metadata needed for selective
// decoding is missing; every frame is decoded.
296 LOG(INFO) <<
" Decoding all frames as we do not have suffiecient" 297 " metadata for selective decoding.";
298 mustDecodeAll =
true;
// Counts frames appended to sampledFrames; compared against maxFrames in the loop
// condition when not decoding everything.
303 int selectiveDecodedFrames = 0;
// Frame budget: MAX_DECODING_FRAMES for uniform sampling, otherwise the number of
// required frames.
305 int maxFrames = (params.decode_type_ == DecodeType::DO_UNIFORM_SMP)
306 ? MAX_DECODING_FRAMES
307 : params.num_of_required_frame_;
// Main loop (parts of the condition are not shown). Visible terms: keep going
// while there is input or a pending picture, while the selective budget is not
// exhausted, and stop once the schedule is past its last interval with a
// timestamp-only or no-frame fps and key frames are not requested.
313 while ((!eof || gotPicture) &&
317 ((!mustDecodeAll) && (selectiveDecodedFrames < maxFrames))) &&
320 !((itvlIter == params.intervals_.end() &&
321 (currFps == SpecialFps::SAMPLE_TIMESTAMP_ONLY ||
322 currFps == SpecialFps::SAMPLE_NO_FRAME)) &&
323 !params.keyFrames_)) {
// Read the next packet from the demuxer.
326 ret = av_read_frame(inputContext, &packet);
// End of file: release the packet and clear its data pointer (presumably so the
// decoder can be drained - TODO confirm against the missing lines).
327 if (ret == AVERROR_EOF) {
329 av_free_packet(&packet);
330 packet.data =
nullptr;
// EAGAIN: release the packet (the statement that follows is not shown).
333 }
else if (ret == AVERROR(EAGAIN)) {
334 av_free_packet(&packet);
// Any other negative result is logged as a read error.
336 }
else if (ret < 0) {
337 LOG(ERROR) <<
"Error reading packet : " << ffmpegErrorStr(ret);
// Packets that belong to other streams are released without decoding.
342 if (packet.stream_index != videoStreamIndex_) {
343 av_free_packet(&packet);
// Decode the packet into videoStreamFrame_; gotPicture is written by the decoder.
348 ret = avcodec_decode_video2(
349 videoCodecContext_, videoStreamFrame_, &gotPicture, &packet);
351 LOG(ERROR) <<
"Error decoding video frame : " << ffmpegErrorStr(ret);
357 av_free_packet(&packet);
// frame_ts (declaration not shown) is the best-effort timestamp in stream
// time_base units; timestamp is that value scaled by the time base.
363 av_frame_get_best_effort_timestamp(videoStreamFrame_);
364 double timestamp = frame_ts * av_q2d(videoStream_->time_base);
// NOTE(review): this condition is logically equivalent to
// (mustDecodeAll || frame_ts >= start_ts).
366 if ((frame_ts >= start_ts && !mustDecodeAll) || mustDecodeAll) {
// The frame has reached the interval at itvlIter: reset the per-interval sampling
// state and adopt the fps of that interval. The step that advances itvlIter is
// not visible here.
376 if (itvlIter != params.intervals_.end() &&
377 timestamp >= itvlIter->timestamp) {
378 lastFrameTimestamp = -1.0;
379 currFps = itvlIter->fps;
380 prevTimestamp = itvlIter->timestamp;
382 if (itvlIter != params.intervals_.end() &&
383 prevTimestamp >= itvlIter->timestamp) {
385 <<
"Sampling interval timestamps must be strictly ascending.";
// True only when key-frame output is requested and this frame is a key frame.
390 bool keyFrame = params.keyFrames_ && videoStreamFrame_->key_frame;
// In a no-frame interval the packet is released (what follows is not shown).
393 if (currFps == SpecialFps::SAMPLE_NO_FRAME) {
394 av_free_packet(&packet);
// fpsReached holds when nothing has been sampled yet in this interval, when all
// frames are wanted, or when at least 1 / currFps has elapsed since the last
// sampled frame (one line of the condition is not shown).
414 bool fpsReached = lastFrameTimestamp < 0 ||
415 currFps == SpecialFps::SAMPLE_ALL_FRAMES ||
417 timestamp >= lastFrameTimestamp + (1 / currFps));
420 av_free_packet(&packet);
// Remember the timestamp of the frame being sampled.
425 lastFrameTimestamp = timestamp;
// Honour the output cap; a maximumOutputFrames_ of -1 disables the check.
428 if (params.maximumOutputFrames_ != -1 &&
429 outputFrameIndex >= params.maximumOutputFrames_) {
431 av_free_packet(&packet);
// Allocate the destination frame; the log below covers allocation failure
// (condition not shown).
435 AVFrame* rgbFrame = av_frame_alloc();
437 LOG(ERROR) <<
"Error allocating AVframe";
// Allocate an output buffer sized for the target pixel format and output
// dimensions, then bind it to rgbFrame via avpicture_fill.
442 int numBytes = avpicture_get_size(pixFormat, outWidth, outHeight);
443 DecodedFrame::AvDataPtr buffer(
444 (uint8_t*)av_malloc(numBytes *
sizeof(uint8_t)));
446 int size = avpicture_fill(
447 (AVPicture*)rgbFrame,
// Source planes, strides and height passed to a conversion call whose name is not
// shown here (presumably sws_scale - TODO confirm).
455 videoStreamFrame_->data,
456 videoStreamFrame_->linesize,
458 videoCodecContext_->height,
// Package the converted pixels and per-frame metadata; ownership of buffer moves
// into the DecodedFrame.
462 unique_ptr<DecodedFrame> frame = make_unique<DecodedFrame>();
463 frame->width_ = outWidth;
464 frame->height_ = outHeight;
465 frame->data_ = move(buffer);
467 frame->index_ = frameIndex;
468 frame->outputFrameIndex_ = outputFrameIndex;
469 frame->timestamp_ = timestamp;
470 frame->keyFrame_ = videoStreamFrame_->key_frame;
// Hand the frame to the caller, count it, and release the temporary AVFrame.
472 sampledFrames.push_back(move(frame));
473 selectiveDecodedFrames++;
474 av_frame_free(&rgbFrame);
// The exception path releases rgbFrame as well.
475 }
catch (
const std::exception&) {
476 av_frame_free(&rgbFrame);
// Drop the reference to the decoded frame data, on both the normal and the
// exception path.
479 av_frame_unref(videoStreamFrame_);
480 }
catch (
const std::exception&) {
481 av_frame_unref(videoStreamFrame_);
// Release the packet, on both the normal and the exception path.
484 av_free_packet(&packet);
485 }
catch (
const std::exception&) {
486 av_free_packet(&packet);
// Normal teardown: scaler, packet, decoded frame, codec, then the demuxer.
// NOTE(review): per the FFmpeg docs avformat_close_input frees the context and
// nulls the pointer, so the avformat_free_context call that follows is
// presumably a no-op - confirm.
491 sws_freeContext(scaleContext_);
492 av_packet_unref(&packet);
493 av_frame_free(&videoStreamFrame_);
494 avcodec_close(videoCodecContext_);
495 avformat_close_input(&inputContext);
496 avformat_free_context(inputContext);
// The outer exception handler repeats the same teardown sequence.
497 }
catch (
const std::exception&) {
500 sws_freeContext(scaleContext_);
501 av_packet_unref(&packet);
502 av_frame_free(&videoStreamFrame_);
503 avcodec_close(videoCodecContext_);
504 avformat_close_input(&inputContext);
505 avformat_free_context(inputContext);
// Decodes a video held in memory: wraps (buffer, size) in a VideoIOContext and
// forwards to decodeLoop together with params, start_frm and sampledFrames.
// The name passed to decodeLoop is the fixed label Memory Buffer.
// NOTE: some parameter lines and the closing brace are missing from this excerpt.
509 void VideoDecoder::decodeMemory(
512 const Params& params,
514 std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
515 VideoIOContext ioctx(buffer, size);
516 decodeLoop(
string(
"Memory Buffer"), ioctx, params, start_frm, sampledFrames);
// Decodes a video from a file: builds a VideoIOContext from file and forwards to
// decodeLoop, passing file itself as the video name along with params, start_frm
// and sampledFrames.
// NOTE: some parameter lines and the closing brace are missing from this excerpt.
519 void VideoDecoder::decodeFile(
521 const Params& params,
523 std::vector<std::unique_ptr<DecodedFrame>>& sampledFrames) {
524 VideoIOContext ioctx(file);
525 decodeLoop(file, ioctx, params, start_frm, sampledFrames);
// Turns an FFmpeg result code into a string: av_strerror writes the message into a
// 128-byte local buffer, which is then copied into the returned string.
// The return value of av_strerror is not checked.
// NOTE: the closing brace is not visible in this excerpt.
528 string VideoDecoder::ffmpegErrorStr(
int result) {
529 std::array<char, 128> buf;
530 av_strerror(result, buf.data(), buf.size());
531 return string(buf.data());
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...