1 #include "caffe2/operators/fully_connected_op.h" 5 #include "caffe2/operators/fc_inference.h" 9 REGISTER_CPU_OPERATOR(
FC, FullyConnectedOp<CPUContext>);
10 REGISTER_CPU_GRADIENT_OPERATOR(
12 FullyConnectedGradientOp<CPUContext>);
14 REGISTER_CPU_OPERATOR(
20 REGISTER_CPU_GRADIENT_OPERATOR(
22 FullyConnectedGradientOp<
namespace {
std::vector<TensorShape> FCGradientShapeInference(
    const OperatorDef& def,
    const vector<TensorShape>& in,
    bool pretransposed_weight) {
  vector<TensorShape> out(2);
  ArgumentHelper helper(def);

  auto axis_w = helper.GetSingleArgument<int32_t>("axis_w", 1);
  const int canonical_axis_w =
      canonical_axis_index_(axis_w, in[1].dims().size());
  const int N = pretransposed_weight
      ? size_from_dim_(canonical_axis_w, GetDimsVector(in[1]))
      : size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));

  vector<int> dW_shape(in[1].dims().begin(), in[1].dims().end());
  out[0] = CreateTensorShape(dW_shape, in[1].data_type()); // dW
  out[1] = CreateTensorShape(vector<int>{N}, in[1].data_type()); // db
  if (def.output_size() == 3) {
    vector<int> dX_shape(in[0].dims().begin(), in[0].dims().end());
    out.push_back(CreateTensorShape(dX_shape, in[0].data_type())); // dX
  }
  return out;
}
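// A minimal shape-inference sketch (hypothetical shapes, not from this
// file): for the plain FC case (pretransposed_weight == false) with X of
// shape (M=4, K=8) and W of shape (N=16, K=8), the inferred gradients are
// dW: (16, 8) and db: (16), plus dX: (4, 8) when a third output is
// requested; dW and dX always mirror the shapes of W and X respectively.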
OpSchema::Cost CostInferenceForFCGradient(
    const OperatorDef& def,
    const vector<TensorShape>& in,
    bool pretransposed_weight) {
  struct OpSchema::Cost c;
  ArgumentHelper helper(def);
  std::vector<TensorShape> out =
      FCGradientShapeInference(def, in, pretransposed_weight);

  CAFFE_ENFORCE_LT(0, out.size());
  const TensorShape dW = out[0];
  const TensorShape db = out[1];

  auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
  const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size());
  const uint64_t M = size_to_dim_(canonical_axis, GetDimsVector(in[0]));
  const uint64_t K = size_from_dim_(canonical_axis, GetDimsVector(in[0]));
  auto axis_w = helper.GetSingleArgument<int32_t>("axis_w", 1);
  const int canonical_axis_w =
      canonical_axis_index_(axis_w, in[1].dims().size());
  const uint64_t N = pretransposed_weight
      ? size_from_dim_(canonical_axis_w, GetDimsVector(in[1]))
      : size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));

  uint64_t size_dW = nElemFromDim(dW);
  uint64_t size_db = nElemFromDim(db);

  // dW = dY^T * X costs 2*M*N*K flops; the column-sum for db adds M*N.
  c.flops = M * N * (2 * K + 1);
  c.bytes_written = (size_dW + size_db) * sizeof(float);
  c.params_bytes = (K * N + N) * sizeof(float);

  if (out.size() == 3) {
    const TensorShape dX = out[2];
    uint64_t size_dX = nElemFromDim(dX);

    // dX = dY * W costs another 2*M*N*K flops.
    c.flops += 2 * M * N * K;
    c.bytes_written += size_dX * sizeof(float);
  }
  return c;
}
} // namespace
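// Worked cost example (hypothetical sizes, not from this file): with M = 4,
// K = 8, N = 16 and all three gradient outputs requested,
//   base flops = M * N * (2K + 1) = 4 * 16 * 17 = 1088,
//   dX flops   = 2 * M * N * K    = 1024,
// for 2112 flops total; bytes_written then covers dW, db and dX in float32.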
using namespace std::placeholders;
OPERATOR_SCHEMA(FCTransposed)
    .NumInputs(3)
    .NumOutputs(1)
    .TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, true))
    .CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, true))
    .SetDoc(R"DOC(
Same as FC, but the weight matrix is supposed to be already pretransposed,
i.e. stored as $(K,N)$ rather than $(N,K)$. FCTransposed stands for calling
BLAS with noTrans, noTrans.
)DOC")
    .InheritOnnxSchema();

OPERATOR_SCHEMA(FC)
    .NumInputs(3)
    .NumOutputs(1)
    .TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
    .CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, false))
    .SetDoc(R"DOC(
The FC operator computes an output $(Y)$ as a linear combination of the input
data blob $(X)$ with a weight blob $(W)$ and bias blob $(b)$. More formally,

$$Y = XW^T+b$$

Here, $X$ is a matrix of shape $(M,K)$, $W$ is a matrix of shape $(N,K)$, $b$
is a vector of length $N$, and $Y$ is a matrix of shape $(M,N)$. $N$ can be
thought of as the number of nodes in the layer, $M$ is the batch size, and
$K$ is the number of features in an input observation.

*NOTE: $X$ does not need to explicitly be a 2-dimensional matrix; if it is
not, it will be coerced into one. For an arbitrary $n$-dimensional tensor $X$,
e.g. $[a_0, a_1, \ldots, a_{k-1}, a_k, \ldots, a_{n-1}]$, where $a_i \in
\mathbb{N}$ and $k$ is the $axis$ arg provided, $X$ will be coerced into a
2-dimensional tensor with dimensions $[a_0 * \ldots * a_{k-1}, a_k * \ldots *
a_{n-1}]$. For the default case where axis=1, the $X$ tensor will be coerced
into a 2D tensor of dimensions $[a_0, a_1 * \ldots * a_{n-1}]$, where $a_0$
is often the batch size. In this situation, we must have $a_0 = M$ and $a_1 *
\ldots * a_{n-1} = K$. Lastly, even though $b$ is a vector of length $N$, it
is copied and resized to shape $(M \times N)$ implicitly, then added to each
vector in the batch.*

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/fully_connected_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/fully_connected_op.cc

<details>

<summary> <b>Example</b> </summary>

**Code**

```

# In this example, our batch size is 1 (M=1), the input observation has 6
# features (K=6), and the layer has one hidden node (N=1). The expected
# output is Y=7.
workspace.ResetWorkspace()

op = core.CreateOperator(
    "FC",
    ["X", "W", "b"],
    ["Y"]
)

# Create X: M x K
data = np.array([1,2,3,4,5,6]).astype(np.float32)
data = data[np.newaxis,:]

# Create W: N x K
weights = np.array([1,1/2.,1/3.,1/4.,1/5.,1/6.]).astype(np.float32)
weights = weights[np.newaxis,:]

# Create b: N
bias = np.array([1.]).astype(np.float32)

# Put the inputs into the workspace
workspace.FeedBlob("X", data)
workspace.FeedBlob("W", weights)
workspace.FeedBlob("b", bias)

# Run the operator
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))

```

**Result**

```

Y:
 [[7.]]

```

</details>

)DOC")
    .Arg(
        "axis",
        "*(type: int; default: 1)* Describes the axis of the input data $X$. "
        "Defaults to one because in the common case when the input $X$ has "
        "shape $(M,K)$, the first axis encodes the batch size.")
183 "*(type: int; default: 1)* Describes the axis of the input weight matrix $W$. Defaults to one because the first axis most likely describes the batch_size.")
186 "*(type: bool; default: False)* Whether to use float-16 compute kernel.")
190 "Input blob to be coerced into a 2D matrix of shape $(M,K)$, where $M$ is the batch size and $K$ is the number of features in a single observation.")
194 "Input blob to be coerced into a 2D matrix of shape $(N,K)$ describing a fully connected weight matrix. Here, $K$ is the number of features in a single observation and $N$ is the number of nodes in the FC layer.")
198 "Input blob containing vector of length $N$ which describes one bias for each node in the layer.")
202 "Output blob containing a 2D output matrix of shape $(M,N)$, where $M$ is the batch size and $N$ is the number of nodes in the layer. The output is calculated as $Y=XW^T+b$.")
203 .InheritOnnxSchema(
"Gemm");
GRADIENT_OPERATOR_SCHEMA(FCGradient)
    .NumInputs(3)
    .NumOutputs(2, 3)
    .TensorInferenceFunction(
        std::bind(FCGradientShapeInference, _1, _2, false))
    .CostInferenceFunction(
        std::bind(CostInferenceForFCGradient, _1, _2, false));
GRADIENT_OPERATOR_SCHEMA(FCTransposedGradient)
    .NumInputs(3)
    .NumOutputs(2, 3)
    .TensorInferenceFunction(
        std::bind(FCGradientShapeInference, _1, _2, true))
    .CostInferenceFunction(
        std::bind(CostInferenceForFCGradient, _1, _2, true));
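// Illustration of the pretransposed_weight flag (hypothetical shapes, not
// from this file): size_to_dim_(k, dims) multiplies the leading dims [0, k)
// and size_from_dim_(k, dims) the trailing dims [k, n). With axis_w = 1 and
// W of shape (16, 8), FCGradient infers N = 16 (W is (N, K)), while
// FCTransposedGradient infers N = 8 (W is stored pretransposed as (K, N)).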
namespace {

class GetFCGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;

  std::vector<OperatorDef> GetGradientDefs() override {
    CAFFE_ENFORCE_EQ(def_.input_size(), 3);
    CAFFE_ENFORCE(def_.type() == "FC" || def_.type() == "FCTransposed");
    return SingleGradientDef(
        def_.type() + "Gradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        // Output order is (dW, db, dX): the gradient of the data input X
        // comes last, matching the gradient schemas above.
        vector<string>{GI(1), GI(2), GI(0)});
  }
};

REGISTER_GRADIENT(FC, GetFCGradient);
REGISTER_GRADIENT(FCTransposed, GetFCGradient);

} // namespace

} // namespace caffe2