// Caffe2 - C++ API
// A deep learning, cross platform ML framework
// recurrent_network_executor_gpu.cc
17 #include "caffe2/operators/recurrent_network_executor_gpu.h"
18 
19 #include "caffe2/core/context_gpu.h"
20 
21 namespace caffe2 {
22 
23 template <>
24 std::unique_ptr<RecurrentNetworkExecutorBase> createRNNExecutor<CUDAContext>(
25  const NetDef& step_net_def,
26  std::map<string, string>& recurrent_input_map,
27  std::string timestep_blob,
28  ArgumentHelper arg_helper) {
29  auto* exec = new CUDARecurrentNetworkExecutor(
30  step_net_def, recurrent_input_map, timestep_blob);
31  int max_streams = arg_helper.GetSingleArgument<int>("rnn_executor.max_cuda_streams", 0);
32  if (max_streams > 0) {
33  exec->setMaxStreams(max_streams);
34  LOG(INFO) << "Set max streams:" << max_streams;
35  }
36  std::unique_ptr<RecurrentNetworkExecutorBase> ptr(exec);
37  return ptr;
38 }
39 
40 CUDARecurrentNetworkExecutor::~CUDARecurrentNetworkExecutor() {
41  for (cudaEvent_t ev : events_) {
42  if (ev != nullptr) {
43  CUDA_CHECK(cudaEventDestroy(ev));
44  }
45  }
46 }
47 
// Executes timesteps [from, to) of the unrolled step network on the GPU.
// Works in either direction: forward when to > from, backward otherwise
// ("to" is exclusive in both cases; see Run / RunBackwards).
//
// When has_timestep_parallelism_ is set, consecutive timesteps are issued on
// different CUDA streams (round-robin over at most max_streams streams), and
// cross-timestep ordering is enforced with cudaEvent record/wait pairs.
// Without it, everything runs on stream 0 and ordering is implicit.
void CUDARecurrentNetworkExecutor::_ExecRange(int from, int to) {
  // +1 for a forward pass, -1 for a backward pass.
  int direction = to > from ? 1 : -1;

  // Cap the number of streams by max_parallel_timesteps_ if that limit is
  // configured (> 0); otherwise use the full stream budget.
  int max_streams = max_parallel_timesteps_ > 0 ?
      std::min(max_parallel_timesteps_, max_cuda_streams_)
      : max_cuda_streams_;
  int stream_seq = 0;
  // All timesteps share the same op list shape, so count ops from step 0.
  int num_ops = timestep_ops_[0].size();

  // One event slot per (timestep, op); created lazily below, so unused slots
  // stay nullptr (the destructor skips those).
  events_.resize(num_ops * timestep_ops_.size(), nullptr);

  // Resolved from the first CUDA op we see; all ops must agree on it.
  int gpu_id = -1;

  // Loop over timesteps
  for (int t = from; t != to; t += direction) {
    bool first_timestep = t == from;
    bool last_timestep =
        (direction == -1 && t == 0) || (direction == 1 && t == to - 1);
    auto& ops = timestep_ops_[t];
    // Round-robin stream assignment; stream_seq only advances when timestep
    // parallelism is enabled, so otherwise this is always stream 0.
    int stream_id = stream_seq % max_streams;

    for (int i = 0; i < ops.size(); i++) {
      auto& rnn_op = ops[i];

      // Special handling for link ops -- we just run them directly
      // they do not execute any kernels.
      if (rnn_op.link_op) {
        rnn_op.op->RunAsync(stream_id);
        CAFFE_ENFORCE(
            rnn_op.dependencies.empty(),
            "GPU executor ignores link dependencies");
        continue;
      }

      // Pin gpu_id to the first CUDA op's device; afterwards enforce that
      // every remaining op is either CPU or on that same GPU.
      // NOTE(review): device_type 1 is treated as CUDA and 0 as CPU here —
      // consistent with the cuda_gpu_id() reads, but defined elsewhere.
      if (gpu_id == -1 && rnn_op.op->device_option().device_type() == 1) {
        gpu_id = rnn_op.op->device_option().cuda_gpu_id();
      } else {
        CAFFE_ENFORCE(
            rnn_op.op->device_option().device_type() == 0 ||
                rnn_op.op->device_option().cuda_gpu_id() == gpu_id,
            "RNN Executor only supports ops on one GPU");
      }

      // If have recurrent parents, add for event waits so that those
      // parents complete their work.
      if (has_timestep_parallelism_ && !first_timestep) {
        for (int parent : rnn_op.parents) {
          // parent > i selects recurrent parents: ops that run later within
          // the PREVIOUS timestep (same-timestep parents have parent < i and
          // are already ordered by the in-step schedule).
          if (parent > i) {
            int parent_ev_idx = (t - direction) * num_ops + parent;
            CHECK(events_.size() > parent_ev_idx);
            // The parent must have recorded its event when it ran (see the
            // dependency branch below), so the slot cannot be empty.
            CAFFE_ENFORCE(events_[parent_ev_idx] != nullptr);
            // Make this timestep's stream wait on the parent's completion
            // without blocking the host.
            CUDA_CHECK(cudaStreamWaitEvent(
                CUDAContext::cuda_stream(gpu_id, stream_id),
                events_[parent_ev_idx],
                0));
          }
        }
      }

      // Run the op in the given stream
      rnn_op.op->RunAsync(stream_id);

      // Create and record event for this op, if it has at least one
      // recurrent dependency.
      if (has_timestep_parallelism_ && !last_timestep) {
        for (int dep : rnn_op.dependencies) {
          // dep < i marks a recurrent dependency (a consumer earlier in the
          // NEXT timestep's schedule); one event per op suffices, hence the
          // break after recording.
          if (dep < i) {
            int event_idx = t * num_ops + i;
            // Create event for recurrent connections
            if (events_[event_idx] == nullptr) {
              CUDA_CHECK(cudaEventCreate(&events_[event_idx]));
            }
            CUDA_CHECK(cudaEventRecord(
                events_[event_idx],
                CUDAContext::cuda_stream(gpu_id, stream_id)));
            break;
          }
        }
      }
    } // for over ops

    // Next timestep will run on different stream
    if (has_timestep_parallelism_) {
      stream_seq++;
    }
  } // for over timesteps

  // Block the host until every stream we actually used has drained, so the
  // caller sees all timesteps complete on return.
  for (int stream_id = 0; stream_id <= std::min(stream_seq, max_streams - 1);
       stream_id++) {
    VLOG(1) << "Wait for stream:" << stream_id;
    CUDA_CHECK(
        cudaStreamSynchronize(CUDAContext::cuda_stream(gpu_id, stream_id)));
  }
}
151 
152 bool CUDARecurrentNetworkExecutor::Run(int T) {
153  _ExecRange(0, T);
154  return true;
155 }
156 
157 bool CUDARecurrentNetworkExecutor::RunBackwards(int T) {
158  _ExecRange(T - 1, -1);
159  return true;
160 }
161 }
// Copyright (c) 2016-present, Facebook, Inc.