Caffe2 - C++ API
A deep learning, cross platform ML framework
compiler.cpp
#include <torch/csrc/jit/fuser/compiler.h>

#include <ATen/ATen.h>
#include <ATen/core/jit_type.h>
#include <c10/util/Exception.h>
#include <torch/csrc/jit/code_template.h>
#include <torch/csrc/jit/fuser/codegen.h>
#include <torch/csrc/jit/fuser/interface.h>
#include <torch/csrc/jit/fuser/kernel_cache.h>
#include <torch/csrc/jit/fuser/tensor_desc.h>
#include <torch/csrc/jit/ir.h>
#include <torch/csrc/jit/operator.h>
#include <torch/csrc/jit/passes/canonicalize.h>
#include <torch/csrc/jit/passes/graph_fuser.h>
#include <torch/csrc/jit/passes/shape_analysis.h>

#include <atomic>
#include <iostream>
#include <memory>
#include <mutex> // for std::mutex / std::lock_guard used below
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <unordered_set>
#include <utility>

namespace torch {
namespace jit {
namespace fuser {

std::mutex fusion_backends_lock_;
static std::unordered_map<at::Device::Type, FusedKernelConstructor>&
getFusionBackends() {
  static std::unordered_map<at::Device::Type, FusedKernelConstructor>
      fusion_backends;
  return fusion_backends;
}

void registerFusionBackend(
    at::Device::Type backend_type,
    FusedKernelConstructor ctor) {
  std::lock_guard<std::mutex> guard(fusion_backends_lock_);
  getFusionBackends()[backend_type] = std::move(ctor);
}

bool hasFusionBackend(at::Device::Type backend_type) {
  std::lock_guard<std::mutex> guard(fusion_backends_lock_);
  return getFusionBackends().count(backend_type);
}

const FusedKernelConstructor& getConstructor(at::Device::Type backend_type) {
  std::lock_guard<std::mutex> guard(fusion_backends_lock_);
  return getFusionBackends().at(backend_type);
}
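
// Usage sketch (illustrative, not part of this translation unit): each device
// backend registers a FusedKernelConstructor for its device type, typically at
// static-initialization time, and compileKernel() below retrieves it through
// getConstructor(). Something along these lines, where makeCPUFusedKernel is a
// hypothetical factory matching the FusedKernelConstructor signature:
//
//   static struct RegisterCPUFuser {
//     RegisterCPUFuser() {
//       registerFusionBackend(at::DeviceType::CPU, makeCPUFusedKernel);
//     }
//   } register_cpu_fuser;
//
// Callers can then query availability before attempting to fuse:
//
//   if (hasFusionBackend(at::DeviceType::CPU)) { /* ... */ }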

// Counter for number of kernels compiled, used for debugging and
// creating arbitrary kernel names.
static std::atomic<size_t> next_kernel_id{0};
static int debug_fusion{-1};

size_t nCompiledKernels() {
  return next_kernel_id.load();
}

int debugFuser() {
  if (debug_fusion < 0) {
    const char* debug_env = getenv("PYTORCH_FUSION_DEBUG");
    debug_fusion = debug_env ? atoi(debug_env) : 0;
  }
  return debug_fusion;
}
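
// Setting the environment variable PYTORCH_FUSION_DEBUG to a nonzero value
// enables debug output; for instance, the code generation path can consult
// this flag and dump each generated kernel's source (sketch only, variable
// names are illustrative):
//
//   if (debugFuser()) {
//     std::cerr << "fusion code:\n" << code << std::endl;
//   }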

// If the given value is used exactly once, by a prim::ConstantChunk node,
// returns that node. Returns nullptr otherwise.
static const Node* usedInFusedChunk(const Value* input) {
  const auto& uses = input->uses();
  if (uses.size() == 1) {
    const Node* user = uses[0].user;
    if (user->kind() == prim::ConstantChunk) {
      return user;
    }
  }
  return nullptr;
}

static void setInputChunkDescriptors(KernelSpec& spec) {
  // There are as many chunk descriptors as tensor inputs, and we know
  // that the tensor inputs come first in the fusion group's inputs.
  spec.inputChunks().reserve(spec.nTensorInputs());
  for (int64_t i = 0; i < spec.nTensorInputs(); i++) {
    const Value* input = spec.graph()->inputs()[i];
    if (const Node* chunk = usedInFusedChunk(input)) {
      spec.inputChunks().emplace_back(
          chunk->i(attr::chunks), chunk->i(attr::dim));
    } else {
      spec.inputChunks().emplace_back(1, 0);
    }
  }
}
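
// For example (illustrative): if a fusion group has two tensor inputs and the
// first is consumed only by prim::ConstantChunk[chunks=2, dim=0], the recorded
// input chunk descriptors are (2, 0) for the first input and the trivial
// descriptor (1, 0) for the second.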

// Run a DFS traversal to find all inputs that affect a given output value
static std::vector<int64_t> getInputDependencies(const Value* output) {
  std::vector<const Value*> queue{output};
  std::unordered_set<const Value*> inputs;
  std::unordered_set<const Value*> seen;
  while (!queue.empty()) {
    const Value* val = queue.back();
    queue.pop_back();
    const Node* producer = val->node();
    // Here we assume that only tensor inputs are used in
    // the computation of the outputs.
    // This is currently true, as the only non-tensor inputs are
    // sizes (for _grad_sum_to_size as the derivative of broadcasts),
    // and those are only used after the fusion kernel runs.
    // This needs to be revisited once other inputs, e.g. nonconstant
    // scalars, are allowed.
    if (producer->kind() == prim::Param &&
        val->type()->isSubtypeOf(TensorType::get())) {
      inputs.insert(val);
      continue;
    }
    for (const Value* input : producer->inputs()) {
      if (/*bool inserted = */ seen.insert(input).second) {
        queue.push_back(input);
      }
    }
  }

  // Convert Value* into offsets into the graph's input list
  std::vector<int64_t> offsets;
  offsets.reserve(inputs.size());
  for (const Value* input : inputs) {
    offsets.push_back(input->offset());
  }

  std::sort(offsets.begin(), offsets.end());
  return offsets;
}
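
// For example (illustrative): for a fusion group with inputs %a, %b, %c at
// offsets 0, 1, 2 and an output computed as %a * %c, the returned offsets
// are {0, 2}.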

static void setInputBroadcastGroups(KernelSpec& spec) {
  std::unordered_set<std::vector<int64_t>, torch::hash<std::vector<int64_t>>>
      broadcast_groups;
  for (const Value* output : (spec.graph())->outputs()) {
    if (output->node()->kind() == prim::FusedConcat) {
      for (const Value* concat_input : output->node()->inputs()) {
        broadcast_groups.insert(getInputDependencies(concat_input));
      }
    } else {
      broadcast_groups.insert(getInputDependencies(output));
    }
  }
  std::copy(
      broadcast_groups.begin(),
      broadcast_groups.end(),
      std::back_inserter(spec.inputBroadcastGroups()));
}
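
// In other words, every input that contributes to the same (non-concat)
// output lands in one broadcast group and will later be expanded to a common
// shape. For a graph computing out1 = %a + %b and out2 = %b * %c
// (illustrative, inputs at offsets 0, 1, 2), the groups are {0, 1} and {1, 2}.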

// This function moves _grad_sum_to_size nodes along the computation graph
// of the fusion group to the outputs and then records the shape inputs
// so that the summation can be applied after the kernel.
// Note that correctness relies on the invariant that
// _grad_sum_to_size is only applied to gradients created by autodiff.
// This is important because it ensures that in mul and div nodes only
// one argument (for div, the numerator) carries a summed value.
// If both arguments to a mul carried one, we would be in trouble, but thanks
// to the chain rule, we're OK.
// Note that this means that one kernel output may lead to several fusion
// group outputs when several outputs had the same calculation except
// for the final _grad_sum_to_size. This is also the reason why
// we need to deduplicate kernel outputs at the end of this function.
void processGradSumToSize(KernelSpec& spec) {
  auto graph = spec.graph();

  std::vector<int64_t> outputGradSumToSizes(graph->outputs().size(), -1);

  // These are expressions that might occur during autodiff operating
  // on the gradient (matmul would likely be one too, but we don't fuse it).
  // Note that for mul, we know (from the chain rule) that only one
  // factor will stem from a calculation involving gradients, so
  // we know that we can move _grad_sum_to_size across it.
  // Scan the graph. We will delete nodes. We want later (in the graph)
  // _grad_sum_to_size nodes to have priority over earlier ones. Thus
  // we scan backwards.
  for (auto it = graph->nodes().rbegin(); it != graph->nodes().rend(); it++) {
    auto* node = *it;
    if (node->kind() != aten::_grad_sum_to_size) {
      continue;
    }
    bool success = trackSingleGradSumToSizeToOutputs(
        node->output(), &outputGradSumToSizes);
    AT_ASSERT(success); // check that we didn't hit anything unknown

    // Remove the GradSumToSize node; a new node outside the fusion graph
    // will be inserted below.
    node->output()->replaceAllUsesWith(node->inputs()[0]);
    it.destroyCurrent();
  }

  // By removing the _grad_sum_to_size nodes, we might end up with
  // duplicate outputs, e.g. in the autodiff backward of x + y + z
  // the gradients of x, y, and z have different _grad_sum_to_size
  // targets but come from the same kernel output.

  // For each fusion group output, record the corresponding kernel
  // output and possibly a _grad_sum_to_size for that output.
  auto& outputMapAndSizes = spec.outputMapAndSizes();
  AT_ASSERT(outputMapAndSizes.empty());
  std::unordered_map<const Value*, int64_t> reduced_output_indices;
  int64_t newo = 0;
  for (auto osize : outputGradSumToSizes) {
    auto it = reduced_output_indices.find(graph->outputs()[newo]);
    if (it == reduced_output_indices.end()) {
      reduced_output_indices.emplace(graph->outputs()[newo], newo);
      outputMapAndSizes.emplace_back(newo, osize);
      newo++;
    } else {
      graph->eraseOutput(newo);
      outputMapAndSizes.emplace_back(it->second, osize);
    }
  }
}
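
// Example (illustrative): in the backward of x + y, both gradient outputs are
// the same incoming gradient, each followed by its own _grad_sum_to_size.
// Once those nodes are removed, the two fusion group outputs point at the same
// kernel value, so the duplicate graph output is erased and outputMapAndSizes
// maps both fusion group outputs to kernel output 0, each paired with its own
// sum-to-size entry to be applied outside the kernel.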

// Performs "upfront" compilation where storage is known but shapes are not.
// Currently identifies how to expand all tensors so that all intermediate
// tensors are the same shape, simplifying code generation.
// Broadcast groups and chunks are identified without shape information
// using logical properties of how each works. In particular, tensors
// are always expandable to the outputs of pointwise operations they
// or their descendants are involved in, which means that in a DAG of
// pointwise operations all tensors are expandable to the (single) output.
// Note: The logic is slightly complicated by concatenation and chunking.
static void upfrontCompilation(KernelSpec& spec) {
  setInputBroadcastGroups(spec);
  setInputChunkDescriptors(spec);
  processGradSumToSize(spec);
}

int64_t registerFusion(const Node* fusion_group) {
  auto graph = normalizeGraphForCache(fusion_group->g(attr::Subgraph));

  // Don't re-register the fusion if we can use a pre-existing one
  const auto maybe_spec = lookupGraph(graph);
  if (maybe_spec) {
    return (*maybe_spec)->key();
  }

  // Unconditionally create and register the fusion.
  // This is necessary to support our global disable-fusions flag: if someone
  // runs some code under no-fusions mode and then runs some code with fusions
  // enabled, the second time around the returned spec from the cache should
  // be a valid spec (must have had upfrontCompilation run on it).
  const auto key = store(graph);
  const auto maybe_retrieved_spec = retrieve(key);
  AT_ASSERT(maybe_retrieved_spec);
  upfrontCompilation(**maybe_retrieved_spec);

  return key;
}
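
// Usage sketch (illustrative): a caller registers each prim::FusionGroup node
// once and stashes the returned key; the key can later be used to retrieve the
// KernelSpec from the kernel cache (see kernel_cache.h), e.g.
//
//   const int64_t key = registerFusion(fusion_group_node);
//   auto maybe_spec = retrieve(key);
//
// and compileKernel() below is invoked when no compiled kernel matches the
// runtime argument shapes.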

std::shared_ptr<FusedKernel> compileKernel(
    const KernelSpec& spec,
    const ArgSpec& arg_spec,
    const std::vector<int64_t>& map_size,
    const at::Device device) {
  const std::vector<TensorDesc>& input_desc = arg_spec.descs();

  auto graph = spec.graph()->copy();

  for (size_t i = 0; i < input_desc.size(); i++) {
    const auto& desc = input_desc[i];
    graph->inputs()[i]->setType(DimensionedTensorType::create(
        desc.scalar_type,
        device,
        desc.nDim())); // TODO: nDim is bad, as it is collapsed
  }

  PropagateInputShapes(graph);

  // Creates chunk and flattened input descriptions
  std::vector<PartitionDesc> chunk_desc;
  std::vector<std::pair<const Value*, const TensorDesc>> flat_inputs;
  {
    size_t input_index = 0;
    for (const auto& p : graph->inputs()) {
      if (!p->type()->isSubtypeOf(TensorType::get())) {
        continue;
      }
      if (const Node* chunk = usedInFusedChunk(p)) {
        int64_t dim = chunk->i(attr::dim);
        int64_t chunks = chunk->i(attr::chunks);
        chunk_desc.emplace_back(input_desc[input_index++], chunks, dim);
        for (const auto* o : chunk->outputs()) {
          flat_inputs.emplace_back(o, *chunk_desc.back().subTensorDesc());
        }
      } else {
        chunk_desc.emplace_back();
        flat_inputs.emplace_back(p, input_desc[input_index++]);
      }
    }
  }

  // Creates output, concat, and flattened output descriptions
  std::vector<TensorDesc> output_desc;
  std::vector<PartitionDesc> concat_desc;
  std::vector<std::pair<const Value*, const TensorDesc>> flat_outputs;
  for (const Value* o : graph->outputs()) {
    // Creates output description
    std::vector<int64_t> sizes = map_size;
    if (o->node()->kind() == prim::FusedConcat) {
      sizes.at(o->node()->i(attr::dim)) *= o->node()->inputs().size();
    }
    auto scalar_type =
        o->type()->expect<c10::DimensionedTensorType const>()->scalarType();
    auto type = CompleteTensorType::create(scalar_type, device, sizes);
    output_desc.emplace_back(type);
    const auto& desc = output_desc.back();

    // Creates concat and flattened output descriptions (relies on output desc)
    if (o->node()->kind() != prim::FusedConcat) {
      concat_desc.emplace_back();
      flat_outputs.emplace_back(o, desc);
    } else {
      const auto cat = o->node();
      concat_desc.emplace_back(desc, cat->inputs().size(), cat->i(attr::dim));
      for (const auto& c : cat->inputs()) {
        flat_outputs.emplace_back(c, *concat_desc.back().subTensorDesc());
      }
    }
  }

  const std::string name = "kernel_" + std::to_string(next_kernel_id++);
  const bool use_cuda = device.is_cuda();
  std::string code =
      generateKernel(name, *graph, flat_inputs, flat_outputs, use_cuda);
  const FusedKernelConstructor& kernel_ctor =
      getConstructor(use_cuda ? at::DeviceType::CUDA : at::DeviceType::CPU);
  return kernel_ctor(
      device.index(),
      name,
      code,
      input_desc,
      output_desc,
      chunk_desc,
      concat_desc,
      spec.hasRandom());
}
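
// Putting it together (illustrative sketch): at launch time the executor
// builds an ArgSpec from the runtime tensors and the common map_size, checks
// the KernelSpec's kernel cache, and only on a miss calls
//
//   compileKernel(spec, arg_spec, map_size, device)
//
// caching the returned FusedKernel for subsequent launches with the same
// argument shapes.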

} // namespace fuser
} // namespace jit
} // namespace torch