// Caffe2 - C++ API
// A deep learning, cross platform ML framework
// kernel_spec.h
#pragma once

#include <ATen/ATen.h>
#include <ATen/core/stack.h>
#include <c10/util/Optional.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
#include <torch/csrc/jit/fuser/arg_spec.h>
#include <torch/csrc/jit/fuser/fused_kernel.h>
#include <torch/csrc/jit/fuser/interface.h>
#include <torch/csrc/jit/interpreter.h>
#include <torch/csrc/jit/ir.h>

#include <algorithm>
#include <cstdint>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <vector>
18 
19 namespace torch {
20 namespace jit {
21 namespace fuser {
22 
23 // Helper struct containing partition information: the number of tensors
24 // created and the dimension the partitioning is performed on.
25 // Note: created during upfront compilation, once the tensors are known
26 // at runtime the partition info is logically combined with the tensor
27 // descriptions to create PartitionDesc objects.
28 struct TORCH_API PartitionInfo {
29  PartitionInfo(const int64_t _nSubTensors, const int64_t _dim)
30  : nSubTensors_{_nSubTensors}, dim_{_dim} {};
31 
32  int64_t nSubTensors() const {
33  return nSubTensors_;
34  }
35  int64_t dim() const {
36  return dim_;
37  }
38 
39  private:
40  int64_t nSubTensors_;
41  int64_t dim_;
42 };
43 
// This is a helper struct to record the following:
// for each fusion group output, it records the corresponding
// kernel output offset (in offset) and the fusion group input
// whose size the sumtosize is to be applied with on that output
// (if any).
// This mapping is necessary as a single kernel output might be
// summed to different sizes.
// These mappings are created during compilation in processGradSumToSize.
51 struct TORCH_API OutputMapAndSize {
52  OutputMapAndSize(const int64_t _offset, const int64_t _sizeInput)
53  : offset_{_offset}, sizeInput_{_sizeInput} {};
54 
55  int64_t offset() const {
56  return offset_;
57  }
58  int64_t sizeInput() const {
59  return sizeInput_;
60  }
61  bool needsSumToSize() const {
62  return sizeInput_ != -1;
63  }
64 
65  private:
66  int64_t offset_;
67  int64_t sizeInput_;
68 };
69 
70 // "Kernel Specification." - Contains device-independent fusion information.
71 // Each kernel specification contains a map of instantiated generated functions
72 // that implement some or most of its functionality. Multiple generated
73 // functions are needed by each abstract specification because of different
74 // devices (cpu vs gpu, different gpus) and different inputs (int vs float,
75 // contiguous vs discontiguous).
76 // Note: uses a mutex to control access to its kernel store
77 // Note: unordered containers do not invalidate references/pointers on
78 // rehashing, which is critical for thread-safety.
79 // TODO: allow abstract kernels to use multiple generated kernels
80 // TODO: allow abstract kernels to reuse generated kernels from common pool
81 struct TORCH_API KernelSpec {
82  // Note: assumes the spec is a single block
83  // Note: This is the appropriate place to generalize if you want to add other
84  // passes to upfront compilation that walk the graph.
85  KernelSpec(const int64_t _key, const std::shared_ptr<Graph>& _graph)
86  : key_{_key},
87  graph_{_graph},
88  code_{_graph},
89  nInputs_{_graph->inputs().size()},
90  nTensorInputs_{},
91  inputBroadcastGroups_{},
92  inputChunks_{},
93  outputMapAndSizes_{},
94  has_random_{false},
95  kernels_{} {
96  for (const auto& n : graph_->nodes()) {
97  if (n->kind() == aten::rand_like) {
98  has_random_ = true;
99  break;
100  }
101  }
102  nTensorInputs_ = std::count_if(
103  graph_->inputs().begin(), graph_->inputs().end(), [](const Value* v) {
104  return v->type()->isSubtypeOf(TensorType::get());
105  });
106  }
107 
108  // Getters
109  int64_t key() const {
110  return key_;
111  }
112  std::shared_ptr<Graph> graph() const {
113  return graph_;
114  }
115  const Code& code() const {
116  return code_;
117  }
118  int64_t nInputs() const {
119  return nInputs_;
120  }
121  int64_t nTensorInputs() const {
122  return nTensorInputs_;
123  }
124 
125  std::vector<std::vector<int64_t>>& inputBroadcastGroups() {
126  return inputBroadcastGroups_;
127  }
128  const std::vector<std::vector<int64_t>>& inputBroadcastGroups() const {
129  return inputBroadcastGroups_;
130  }
131 
132  std::vector<PartitionInfo>& inputChunks() {
133  return inputChunks_;
134  }
135  const std::vector<PartitionInfo>& inputChunks() const {
136  return inputChunks_;
137  }
138 
139  std::vector<OutputMapAndSize>& outputMapAndSizes() {
140  return outputMapAndSizes_;
141  }
142 
143  bool hasRandom() const {
144  return has_random_;
145  }
146 
147  // Cache functions
149  const ArgSpec& arg_spec) const {
150  std::lock_guard<std::mutex> guard{mutex_};
151  const auto it = kernels_.find(arg_spec);
152  if (it == kernels_.end())
153  return c10::nullopt;
154  return it->second;
155  }
156  void cacheKernel(const ArgSpec& arg_spec, std::shared_ptr<FusedKernel> kernel)
157  const {
158  std::lock_guard<std::mutex> guard{mutex_};
159  kernels_.emplace(arg_spec, kernel);
160  }
161 
162  private:
163  int64_t key_;
164  std::shared_ptr<Graph> graph_;
165  Code code_;
166  uint64_t nInputs_;
167  uint64_t nTensorInputs_;
168  std::vector<std::vector<int64_t>> inputBroadcastGroups_;
169  std::vector<PartitionInfo> inputChunks_;
170  // This will initially be an empty vector. During kernel compilation
171  // in processGradSumToSize it will be filled and will contain one
172  // element per fusion group output (which may be larger than the
173  // number of kernel outputs).
174  std::vector<OutputMapAndSize> outputMapAndSizes_;
175  bool has_random_;
176  mutable std::mutex mutex_;
177  mutable std::
178  unordered_map<ArgSpec, std::shared_ptr<FusedKernel>, torch::hash<ArgSpec>>
179  kernels_;
180 };
181 
182 } // namespace fuser
183 } // namespace jit
184 } // namespace torch
// (doc-extraction cross-reference: Definition: jit_type.h:17)