1 #include "caffe2/core/net_simple.h" 2 #include "caffe2/core/net.h" 6 #include <unordered_map> 7 #include <unordered_set> 9 #include "caffe2/core/operator.h" 10 #include "caffe2/core/static_tracepoint.h" 11 #include "caffe2/core/timer.h" 12 #include "caffe2/proto/caffe2_pb.h" 13 #include "caffe2/utils/proto_utils.h" 16 caffe2_simple_net_benchmark_run_whole_net,
18 "If false, whole net passes won't be performed");
23 const std::shared_ptr<const NetDef>& net_def,
25 : NetBase(net_def, ws) {
26 VLOG(1) <<
"Constructing SimpleNet " << net_def->name();
27 const bool net_def_has_device_option = net_def->has_device_option();
29 for (
int idx = 0; idx < net_def->op_size(); ++idx) {
30 const auto& operator_def = net_def->op(idx);
31 VLOG(1) <<
"Creating operator " << operator_def.name() <<
": " 32 << operator_def.type();
33 std::unique_ptr<OperatorBase> op{
nullptr};
34 if (!operator_def.has_device_option() && net_def_has_device_option) {
38 OperatorDef temp_def(operator_def);
39 temp_def.mutable_device_option()->CopyFrom(net_def->device_option());
40 op = CreateOperator(temp_def, ws, idx);
42 op = CreateOperator(operator_def, ws, idx);
44 std::shared_ptr<const OperatorDef>{net_def, &(net_def->op(idx))});
46 operators_.emplace_back(std::move(op));
// Walks operators_ in container order, logging the net name and then each
// operator's name and type at VLOG(1).
// NOTE(review): this listing is an extraction with dropped lines (the embedded
// original line numbers jump 50->52, 61->64 and 65->68), so the actual operator
// invocation, the failure check and the return statements are not visible
// here. Confirm against the real file before relying on this description.
50 bool SimpleNet::Run() {
52 VLOG(1) <<
"Running net " << name_;
53 for (
auto& op : operators_) {
54 VLOG(1) <<
"Running operator " << op->debug_def().name() <<
"(" 55 << op->debug_def().type() <<
").";
// When CAFFE2_ENABLE_SDT is defined, the operator's name and type strings, the
// raw operator pointer and the net name are captured and handed to the
// operator_start probe.
56 #ifdef CAFFE2_ENABLE_SDT 57 const auto& op_name = op->debug_def().name().c_str();
58 const auto& op_type = op->debug_def().type().c_str();
59 auto* op_ptr = op.get();
60 const auto& net_name = name_.c_str();
61 CAFFE_SDT(operator_start, net_name, op_name, op_type, op_ptr);
// Matching operator_done probe, emitted with the same four arguments.
64 #ifdef CAFFE2_ENABLE_SDT 65 CAFFE_SDT(operator_done, net_name, op_name, op_type, op_ptr);
// Writes the operator's full definition (via ProtoDebugString) to the error log.
// NOTE(review): the guarding condition is among the dropped lines; presumably
// this is reached only when the operator reports failure. Verify.
68 LOG(ERROR) <<
"Operator failed: " << ProtoDebugString(op->debug_def());
76 bool SimpleNet::RunAsync() {
/// Comparison predicate that orders two pairs by their `second` member,
/// largest first; the `first` member is ignored.
///
/// Returns true only when `x.second` is strictly greater than `y.second`, so
/// pairs with equal `second` values are treated as equivalent. Elsewhere in
/// this file it is instantiated as `PairLargerThan<string, float>` and passed
/// together with the begin/end iterators of a vector of (name, value) pairs.
///
/// @tparam A type of the pair's first element (not inspected).
/// @tparam B type of the pair's second element; must support `operator>`.
/// @param x left-hand pair.
/// @param y right-hand pair.
/// @return `x.second > y.second`.
template <typename A, typename B>
bool PairLargerThan(const std::pair<A, B>& x, const std::pair<A, B>& y) {
  return x.second > y.second;
}
88 const int warmup_runs,
90 const bool run_individual) {
92 std::cout <<
"Starting benchmark." << std::endl;
93 std::cout <<
"Running warmup runs." << std::endl;
96 "Number of warm up runs should be non negative, provided ",
99 for (
int i = 0; i < warmup_runs; ++i) {
100 CAFFE_ENFORCE(Run(),
"Warmup run ", i,
" has failed.");
103 std::cout <<
"Main runs." << std::endl;
106 "Number of main runs should be non negative, provided ",
111 if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) {
112 for (
int i = 0; i < main_runs; ++i) {
113 CAFFE_ENFORCE(Run(),
"Main run ", i,
" has failed.");
116 std::cout <<
"Main run finished. Milliseconds per iter: " 117 << millis / main_runs
118 <<
". Iters per second: " << 1000.0 * main_runs / millis
121 vector<float> time_per_op(operators_.size(), 0);
122 vector<uint64_t> flops_per_op;
123 vector<uint64_t> memory_bytes_read_per_op;
124 vector<uint64_t> memory_bytes_written_per_op;
125 vector<uint64_t> param_bytes_per_op;
126 CaffeMap<string, float> time_per_op_type;
127 CaffeMap<string, float> flops_per_op_type;
128 CaffeMap<string, float> memory_bytes_read_per_op_type;
129 CaffeMap<string, float> memory_bytes_written_per_op_type;
130 CaffeMap<string, float> param_bytes_per_op_type;
131 if (run_individual) {
132 for (
int i = 0; i < main_runs; ++i) {
133 for (
auto& op : operators_) {
137 for (
auto& op : operators_) {
138 const string& op_type = op->debug_def().type();
140 auto* schema = OpSchemaRegistry::Schema(op_type);
141 if (schema && schema->HasCostInferenceFunction()) {
142 vector<TensorShape> shapes = op->InputTensorShapes();
144 auto all_good_shapes = std::accumulate(
148 [](
bool acc,
const TensorShape& shape) {
149 return acc && !shape.unknown_shape();
152 if (all_good_shapes) {
153 cost = schema->InferCost(op->debug_def(), shapes);
156 flops_per_op.emplace_back(cost.flops);
157 memory_bytes_read_per_op.emplace_back(cost.bytes_read);
158 memory_bytes_written_per_op.emplace_back(cost.bytes_written);
159 param_bytes_per_op.emplace_back(cost.params_bytes);
161 flops_per_op_type[op_type] += cost.flops;
162 memory_bytes_read_per_op_type[op_type] += cost.bytes_read;
163 memory_bytes_written_per_op_type[op_type] += cost.bytes_written;
164 param_bytes_per_op_type[op_type] += cost.params_bytes;
166 flops_per_op.emplace_back(0);
167 memory_bytes_read_per_op.emplace_back(0);
168 memory_bytes_written_per_op.emplace_back(0);
169 param_bytes_per_op.emplace_back(0);
176 op->debug_def().name(),
181 time_per_op[idx] += spent;
182 time_per_op_type[op_type] += spent;
187 for (
auto& op : operators_) {
188 const string& op_type = op->debug_def().type();
189 const string& print_name =
190 (op->debug_def().name().size()
191 ? op->debug_def().name()
192 : (op->debug_def().output_size() ? op->debug_def().output(0)
194 std::stringstream flops_str;
195 if (idx < flops_per_op.size() && flops_per_op[idx]) {
196 flops_str <<
" (" << to_string(1.0e-9 * flops_per_op[idx]) <<
" GFLOP, " 198 1.0e-6 * flops_per_op[idx] / time_per_op[idx] *
202 std::stringstream memory_bytes_read_str;
203 if (idx < memory_bytes_read_per_op.size() &&
204 memory_bytes_read_per_op[idx]) {
205 memory_bytes_read_str
206 <<
" (" << to_string(1.0e-6 * memory_bytes_read_per_op[idx])
209 std::stringstream memory_bytes_written_str;
210 if (idx < memory_bytes_written_per_op.size() &&
211 memory_bytes_written_per_op[idx]) {
212 memory_bytes_written_str
213 <<
" (" << to_string(1.0e-6 * memory_bytes_written_per_op[idx])
216 std::stringstream param_bytes_str;
217 if (idx < param_bytes_per_op.size() && param_bytes_per_op[idx]) {
218 param_bytes_str <<
" (" << to_string(1.0e-6 * param_bytes_per_op[idx])
221 std::cout <<
"Operator #" << idx <<
" (" << print_name <<
", " << op_type
222 <<
") " << time_per_op[idx] / main_runs <<
" ms/iter" 223 << flops_str.str() << memory_bytes_written_str.str()
224 << param_bytes_str.str() << std::endl;
227 const std::vector<string> metric({
"Time",
229 "Feature Memory Read",
230 "Feature Memory Written",
231 "Parameter Memory"});
232 const std::vector<double> normalizer(
233 {1.0 / main_runs, 1.0e-9, 1.0e-6, 1.0e-6, 1.0e-6});
234 const std::vector<string> unit({
"ms",
"GFLOP",
"MB",
"MB",
"MB"});
236 std::vector<CaffeMap<string, float>*> metric_per_op_type_vec_vec;
237 metric_per_op_type_vec_vec.emplace_back(&time_per_op_type);
238 metric_per_op_type_vec_vec.emplace_back(&flops_per_op_type);
239 metric_per_op_type_vec_vec.emplace_back(&memory_bytes_read_per_op_type);
240 metric_per_op_type_vec_vec.emplace_back(&memory_bytes_written_per_op_type);
241 metric_per_op_type_vec_vec.emplace_back(&param_bytes_per_op_type);
242 for (
size_t i = 0; i < metric_per_op_type_vec_vec.size(); ++i) {
243 std::cout << metric[i] <<
" per operator type:" << std::endl;
244 auto* item = metric_per_op_type_vec_vec[i];
245 std::vector<std::pair<string, float>> metric_per_op_type_vec(
246 (*item).begin(), (*item).end());
248 metric_per_op_type_vec.begin(),
249 metric_per_op_type_vec.end(),
250 PairLargerThan<string, float>);
251 float total_metric = 0.;
252 for (
const auto& op_item : metric_per_op_type_vec) {
253 total_metric += op_item.second * normalizer[i];
255 for (
const auto& op_item : metric_per_op_type_vec) {
257 if (total_metric > 0.) {
258 percent = (100.0 * op_item.second * normalizer[i] / total_metric);
260 std::cout << std::setw(15) << std::setfill(
' ')
261 << op_item.second * normalizer[i] <<
" " << unit[i] <<
". " 262 << std::setw(10) << std::setfill(
' ') << percent <<
"%. " 263 << op_item.first << std::endl;
265 std::cout << std::setw(15) << std::setfill(
' ') << total_metric <<
" " 266 << unit[i] <<
" in Total" << std::endl;
270 for (
size_t i = 0; i < time_per_op.size(); ++i) {
271 time_per_op[i] /= main_runs;
273 if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) {
274 time_per_op.insert(time_per_op.begin(), millis / main_runs);
void Start()
Starts a timer.
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
float MilliSeconds()
Returns the elapsed time in milliseconds.
vector< float > TEST_Benchmark(const int warmup_runs, const int main_runs, const bool run_individual) override
Benchmarks a network.
A simple timer object for measuring time.